<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title><![CDATA[沧海一粟]]></title> 
<link>http://www.dzhope.com/index.php</link> 
<description><![CDATA[Web系统架构与服务器运维,php开发]]></description> 
<language>zh-cn</language> 
<copyright><![CDATA[沧海一粟]]></copyright>
<item>
<link>http://www.dzhope.com/post//</link>
<title><![CDATA[PHP实现敏感词过滤系统]]></title> 
<author>jed &lt;jed521@163.com&gt;</author>
<category><![CDATA[服务器技术]]></category>
<pubDate>Sun, 06 Nov 2016 14:44:31 +0000</pubDate> 
<guid>http://www.dzhope.com/post//</guid> 
<description>
<![CDATA[ 
	安装说明<br/><br/>安装PHP扩展 trie_filter，安装教程 <a href="http://blog.41ms.com/post/39.html" target="_blank">http://blog.41ms.com/post/39.html</a><br/>安装PHP扩展 swoole，安装教程 <a href="http://www.swoole.com/" target="_blank">http://www.swoole.com/</a><br/><br/>代码说明<br/><br/><br/>1、敏感词库维护更新脚本：<br/><br/>reload_dict.php，提供自动更新字典库到trie-tree文件的过程<br/><div class="code"><br/>&lt;?php<br/><br/>// 设置内存<br/>ini_set(&#039;memory_limit&#039;, &#039;128M&#039;);<br/><br/>// 读取敏感词字典库<br/>$handle = fopen(&#039;dict.txt&#039;, &#039;r&#039;);<br/><br/>// 生成空的trie-tree-filter<br/>$resTrie = trie_filter_new();<br/><br/>while(! feof($handle)) &#123;<br/>&nbsp;&nbsp;&nbsp;&nbsp;$item = trim(fgets($handle));<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;if (empty($item)) &#123;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;continue;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;// 把敏感词逐个加入trie-tree<br/>&nbsp;&nbsp;&nbsp;&nbsp;trie_filter_store($resTrie, $item);<br/>&#125;<br/><br/>// 生成trie-tree文件<br/>$blackword_tree = &#039;blackword.tree&#039;;<br/><br/>trie_filter_save($resTrie, $blackword_tree);<br/></div><br/><br/>2、trie树对象获取工具类<br/><br/>FilterHelper.php，提供获取trie-tree对象，避免重复生成trie-tree对象和保证tree文件与敏感词库的同步更新<br/><div class="code"><br/>&lt;?php<br/>/**<br/> * 过滤器助手<br/> *<br/> * getResTrie 提供trie-tree对象;<br/> * getFilterWords 提取过滤出的字符串<br/> *<br/> * @author W.Y.P (wangyupeng@jiayuan.com)<br/> */<br/><br/><br/>class FilterHelper<br/>&#123;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;// trie-tree对象<br/>&nbsp;&nbsp;&nbsp;&nbsp;private static $_resTrie = null;<br/>&nbsp;&nbsp;&nbsp;&nbsp;// 字典树的更新时间<br/>&nbsp;&nbsp;&nbsp;&nbsp;private static $_mtime = null;<br/><br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;/**<br/>&nbsp;&nbsp;&nbsp;&nbsp; * 防止初始化<br/>&nbsp;&nbsp;&nbsp;&nbsp; */<br/>&nbsp;&nbsp;&nbsp;&nbsp;private function __construct() &#123;&#125;<br/><br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;/**<br/>&nbsp;&nbsp;&nbsp;&nbsp; * 防止克隆对象<br/>&nbsp;&nbsp;&nbsp;&nbsp; */<br/>&nbsp;&nbsp;&nbsp;&nbsp;private function 
__clone() &#123;&#125;<br/><br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;/**<br/>&nbsp;&nbsp;&nbsp;&nbsp; * 提供trie-tree对象<br/>&nbsp;&nbsp;&nbsp;&nbsp; *<br/>&nbsp;&nbsp;&nbsp;&nbsp; * @param $tree_file 字典树文件路径<br/>&nbsp;&nbsp;&nbsp;&nbsp; * @param $new_mtime 当前调用时字典树的更新时间<br/>&nbsp;&nbsp;&nbsp;&nbsp; * @return null<br/>&nbsp;&nbsp;&nbsp;&nbsp; */<br/>&nbsp;&nbsp;&nbsp;&nbsp;static public function getResTrie($tree_file, $new_mtime) &#123;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;if (is_null(self::$_mtime)) &#123;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;self::$_mtime = $new_mtime;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;if (($new_mtime != self::$_mtime) &#124;&#124; is_null(self::$_resTrie)) &#123;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;self::$_resTrie = trie_filter_load($tree_file);<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;self::$_mtime = $new_mtime;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 输出字典文件重载时间<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;echo date(&#039;Y-m-d H:i:s&#039;) . 
&quot;&#92;tdictionary reload success!&#92;n&quot;;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;return self::$_resTrie;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/><br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;/**<br/>&nbsp;&nbsp;&nbsp;&nbsp; * 从原字符串中提取过滤出的敏感词<br/>&nbsp;&nbsp;&nbsp;&nbsp; *<br/>&nbsp;&nbsp;&nbsp;&nbsp; * @param $str 原字符串<br/>&nbsp;&nbsp;&nbsp;&nbsp; * @param $res 1-3 表示 从位置1开始，3个字符长度<br/>&nbsp;&nbsp;&nbsp;&nbsp; * @return array<br/>&nbsp;&nbsp;&nbsp;&nbsp; */<br/>&nbsp;&nbsp;&nbsp;&nbsp;static public function getFilterWords($str, $res)<br/>&nbsp;&nbsp;&nbsp;&nbsp;&#123;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$result = array();<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;foreach ($res as $k =&gt; $v) &#123;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$word = substr($str, $v&#91;0&#93;, $v&#91;1&#93;);<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;if (!in_array($word, $result)) &#123;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$result&#91;&#93; = $word;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;return $result;<br/>&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/>&#125;<br/></div><br/><br/>3、对外提供过滤HTTP访问接口<br/><br/>filter.php，使用swoole，对外提供过滤接口访问<br/><br/><div class="code"><br/>&lt;?php<br/><br/>// 设置脚本最大运行内存，根据字典大小调整<br/>ini_set(&#039;memory_limit&#039;, &#039;512M&#039;);<br/><br/>// 设置时区<br/>date_default_timezone_set(&#039;Asia/Shanghai&#039;);<br/><br/>// 加载助手文件<br/>require_once(&#039;FilterHelper.php&#039;);<br/><br/>// http服务绑定的ip及端口<br/>$serv = new swoole_http_server(&quot;182.92.177.16&quot;, 9502);<br/><br/><br/>/**<br/> * 处理请求<br/> */<br/>$serv-&gt;on(&#039;Request&#039;, function($request, $response) 
&#123;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;// 接收get请求参数<br/>&nbsp;&nbsp;&nbsp;&nbsp;$content = isset($request-&gt;get&#91;&#039;content&#039;&#93;) ? $request-&gt;get&#91;&#039;content&#039;&#93;: &#039;&#039;;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;$result = &#039;&#039;;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;if (!empty($content)) &#123;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 字典树文件路径，默认当前目录下<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$tree_file = &#039;blackword.tree&#039;;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 清除文件状态缓存<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;clearstatcache();<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 获取请求时，字典树文件的修改时间<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$new_mtime = filemtime($tree_file);<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 获取最新trie-tree对象<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$resTrie = FilterHelper::getResTrie($tree_file, $new_mtime);<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 执行过滤<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$arrRet = trie_filter_search_all($resTrie, $content);<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;// 提取过滤出的敏感词<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$a_data = FilterHelper::getFilterWords($content, $arrRet);<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;$result = json_encode($a_data);<br/>&nbsp;&nbsp;&nbsp;&nbsp;&#125;<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;// 定义http服务信息及响应处理结果<br/>&nbsp;&nbsp;&nbsp;&nbsp;$response-&gt;cookie(&quot;User&quot;, &quot;W.Y.P&quot;);<br/>&nbsp;&nbsp;&nbsp;&nbsp;$response-&gt;header(&quot;X-Server&quot;, &quot;W.Y.P WebServer(Unix) (Red-Hat/Linux)&quot;);<br/>&nbsp;&nbsp;&nbsp;&nbsp;$response-&gt;header(&#039;Content-Type&#039;, &#039;Content-Type: text/html; 
charset=utf-8&#039;);<br/>&nbsp;&nbsp;&nbsp;&nbsp;$response-&gt;end($result);<br/>&#125;);<br/><br/>$serv-&gt;start();<br/><br/></div><br/><br/>测试效果<br/><br/>词库内容：<br/><br/><a href="http://www.dzhope.com/attachment.php?fid=84" target="_blank"><img src="http://www.dzhope.com/attachment.php?fid=84" class="insertimage" alt="点击在新窗口中浏览此图片" title="点击在新窗口中浏览此图片" border="0"/></a><br/><br/>接口响应过滤结果：<br/><br/><a href="http://www.dzhope.com/attachment.php?fid=85" target="_blank"><img src="http://www.dzhope.com/attachment.php?fid=85" class="insertimage" alt="点击在新窗口中浏览此图片" title="点击在新窗口中浏览此图片" border="0"/></a><br/><br/>尝试更新敏感词库，接口程序已自动加载最新敏感词库，保证过滤效果<br/><br/><a href="http://www.dzhope.com/attachment.php?fid=86" target="_blank"><img src="http://www.dzhope.com/attachment.php?fid=86" class="insertimage" alt="点击在新窗口中浏览此图片" title="点击在新窗口中浏览此图片" border="0"/></a><br/><br/>ab测试结果<br/><br/>词库：200W敏感词<br/>服务器配置（CPU：1核；内存：1024 MB；带宽：1Mbps）<br/><a href="http://www.dzhope.com/attachment.php?fid=87" target="_blank"><img src="http://www.dzhope.com/attachment.php?fid=87" class="insertimage" alt="点击在新窗口中浏览此图片" title="点击在新窗口中浏览此图片" border="0"/></a><br/>
]]>
</description>
</item><item>
<link>http://www.dzhope.com/post//#blogcomment</link>
<title><![CDATA[[评论] PHP实现敏感词过滤系统]]></title> 
<author> &lt;user@domain.com&gt;</author>
<category><![CDATA[评论]]></category>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate> 
<guid>http://www.dzhope.com/post//#blogcomment</guid> 
<description>
<![CDATA[ 
	
]]>
</description>
</item>
</channel>
</rss>