PHP采集程序中常用的函数

Filed under: Spider |
Posted on
  1. <?php
  /**
   * Return the request URI of the currently running script.
   *
   * Uses $_SERVER["REQUEST_URI"] when it is non-empty. Otherwise the value is
   * rebuilt from $_SERVER["PHP_SELF"], with "?" and $_SERVER["QUERY_STRING"]
   * appended when the query string is non-empty.
   *
   * @return string Path of the current script, including the query string if any.
   */
  function get_php_url(){
      if (empty($_SERVER["REQUEST_URI"])) {
          // Fallback for environments that do not populate REQUEST_URI.
          $path = $_SERVER["PHP_SELF"];
          return empty($_SERVER["QUERY_STRING"])
              ? $path
              : $path."?".$_SERVER["QUERY_STRING"];
      }
      return $_SERVER["REQUEST_URI"];
  }
  /**
   * Convert full-width digits to ASCII digits and reduce the input to a number string.
   *
   * Steps:
   *  1. Full-width digits U+FF10..U+FF19 (UTF-8 encoded) are mapped to "0".."9".
   *  2. Every byte that is not an ASCII digit or "." is removed, and a run of
   *     zeros at the very start of the string is stripped.
   *
   * @param string $fnum Text containing a number; expected to be UTF-8 when it
   *                     holds full-width digits.
   * @return string|int  The cleaned digit string, or integer 0 when nothing is left.
   */
  function GetAlabNum($fnum){
      // Build the map from byte escapes rather than literal characters so the
      // table cannot be lost when this file is re-encoded.
      // U+FF10..U+FF19 are EF BC 90 .. EF BC 99 in UTF-8.
      $map = array();
      for ($i = 0; $i <= 9; $i++) {
          $map["\xEF\xBC".chr(0x90 + $i)] = (string)$i;
      }
      $fnum = strtr((string)$fnum, $map);

      // preg_replace replaces ereg_replace, which was removed in PHP 7.0.
      $fnum = preg_replace('/[^0-9.]|^0+/', '', $fnum);

      if ($fnum === '' || $fnum === null) {
          $fnum = 0;
      }
      return $fnum;
  }

  /**
   * Convert plain text to HTML-safe text with explicit line breaks.
   *
   * Each pair of consecutive spaces is replaced by a single space, "<" and ">"
   * are replaced by "&lt;" and "&gt;", and every line break becomes
   * "<br/>" followed by CRLF.
   *
   * @param string $txt Plain text.
   * @return string     Text with angle brackets escaped and <br/> inserted.
   */
  function Text2Html($txt){
      $txt = str_replace("  "," ",$txt);
      $txt = str_replace(array("<", ">"), array("&lt;", "&gt;"), $txt);
      // Treat CRLF as ONE line break. The previous pattern used the U (ungreedy)
      // modifier on a character class, which matched "\r" and "\n" separately
      // and emitted two <br/> for every Windows line ending.
      $txt = preg_replace("/\r\n|\r|\n/", "<br/>\r\n", $txt);
      return $txt;
  }
  9.  
  /**
   * Neutralise HTML markup by escaping angle brackets.
   *
   * Replaces "<" with "&lt;" and ">" with "&gt;"; no other character is touched.
   *
   * @param string $str Input text.
   * @return string     Text with both angle brackets escaped.
   */
  function ClearHtml($str){
      return str_replace(array('<', '>'), array('&lt;', '&gt;'), $str);
  }
  /**
   * Rewrite root-relative href/src attributes in $content to absolute URLs.
   *
   * The scheme and authority (host, plus port or user info if present) are taken
   * from $feed_url, which must start with http://, https:// or ftp://. Attribute
   * values that begin with a single "/" get that origin prepended.
   *
   * Left untouched:
   *  - protocol-relative values ("//cdn.example.com/x"), which already name a host;
   *  - values that are already absolute or are document-relative ("img/a.png").
   *
   * @param string $content  HTML fragment to rewrite.
   * @param string $feed_url URL the content was fetched from.
   * @return string          Rewritten HTML, or $content unchanged when $feed_url
   *                         has no supported scheme or no host.
   */
  function relative_to_absolute($content, $feed_url) {
      // Anchor the match so a URL embedded in the query string is not mistaken
      // for the feed's own scheme and host.
      if (!preg_match('/^\s*((?:https?|ftp):\/\/)([^\/?#]+)/i', $feed_url, $parts)) {
          return $content;
      }
      $origin = $parts[1].$parts[2];

      // "\" and "$" are special in a preg_replace replacement string; escape them
      // so the origin is always inserted literally.
      $safe_origin = strtr($origin, array('\\' => '\\\\', '$' => '\\$'));

      // (?!\/) skips protocol-relative URLs; the quote is captured so both
      // single- and double-quoted attributes are handled.
      return preg_replace(
          '/\b(href|src)=(["\'])\/(?!\/)/i',
          '${1}=${2}'.$safe_origin.'/',
          $content
      );
  }
  /**
   * Extract every link (URL and link text) from an HTML fragment.
   *
   * Matches <a> elements that have an href attribute, whether or not href is
   * the first attribute, and whose link text contains no ">" character
   * (anchors wrapping other tags are therefore not matched).
   *
   * @param string $code HTML to scan.
   * @return array       array('name' => list of link texts, 'url' => list of
   *                     href values); the two lists are index-aligned.
   */
  function get_all_url($code){
      // Changes from the earlier pattern:
      //  - the quote class is ["'] (it used to contain a stray "|");
      //  - an optional non-capturing prefix lets attributes precede href, so the
      //    capture group numbers are unchanged.
      preg_match_all(
          '/<a\s+(?:[^>]*?\s+)?href\s*=\s*["\']?([^>"\' ]+)["\']?[^>]*>([^>]+)<\/a>/i',
          $code,
          $arr
      );
      return array('name'=>$arr[2],'url'=>$arr[1]);
  }
  39.  
  /**
   * Return the text found between a start marker and an end marker.
   *
   * The input is split on every occurrence of $start; the piece after the
   * first occurrence is then cut at the first occurrence of $end.
   *
   * @param string $str   Text to search.
   * @param string $start Opening marker.
   * @param string $end   Closing marker.
   * @return string|null  The enclosed text; "" when $start does not occur in
   *                      $str; null when either marker is empty.
   */
  function get_tag_data($str, $start, $end){
      if ( $start == '' || $end == '' ){
          return null;
      }
      $pieces = explode($start, $str);
      // Without a second piece the start marker is absent. Returning "" here
      // keeps the old result while avoiding the undefined-offset warning and
      // the null argument to explode().
      if (!isset($pieces[1])) {
          return '';
      }
      $pieces = explode($end, $pieces[1]);
      return $pieces[0];
  }
  /**
   * Convert each row of an HTML table into one CSV-style string.
   *
   * Every cell is wrapped in double quotes and cells are joined with commas,
   * e.g. a row with cells "a" and "b" becomes the string "a","b" (quotes
   * included). All other tags, line breaks and space characters are removed.
   * Quotes inside a cell are not escaped.
   *
   * @param string $table HTML containing <tr>/<td> markup.
   * @return array        One string per table row, in document order.
   */
  function get_tr_array($table) {
      $table = preg_replace("'<td[^>]*?>'si",'"',$table);
      // Closing tags are matched case-insensitively, like the opening tags
      // above; otherwise </TD> and </TR> would be discarded by the tag
      // stripper below and no rows would be found.
      $table = str_ireplace("</td>",'",',$table);
      $table = str_ireplace("</tr>","{tr}",$table);
      // Strip every remaining HTML tag.
      $table = preg_replace("'<[\/\!]*?[^<>]*?>'si","",$table);
      // Remove line breaks together with any whitespace that follows them.
      // A lone "\n" must go too, or it ends up between "," and "{tr}" and the
      // row separator below never matches.
      $table = preg_replace("'[\r\n]+\s*'","",$table);
      // NOTE(review): this replacement appeared twice in a row in the earlier
      // code; the second call could not change anything. One of the two was
      // presumably meant for something else (e.g. "&nbsp;") - confirm.
      $table = str_replace(" ","",$table);

      $rows = explode(",{tr}",$table);
      // Whatever follows the last row separator is not a row.
      array_pop($rows);
      return $rows;
  }
  65.  
  /**
   * Convert an HTML table into a two-dimensional array of cell texts.
   *
   * All tags are removed, and so are line breaks and space characters, so a
   * cell containing "x y" is returned as "xy".
   *
   * @param string $table HTML containing <table>/<tr>/<td> markup.
   * @return array        List of rows; each row is a list of cell strings.
   *                      An empty array when no row is found.
   */
  function get_td_array($table) {
      $table = preg_replace("'<table[^>]*?>'si","",$table);
      $table = preg_replace("'<tr[^>]*?>'si","",$table);
      $table = preg_replace("'<td[^>]*?>'si","",$table);
      // Closing tags are matched case-insensitively, like the opening tags
      // above; otherwise </TR> and </TD> would be discarded by the tag
      // stripper below and no rows or cells would be found.
      $table = str_ireplace("</tr>","{tr}",$table);
      $table = str_ireplace("</td>","{td}",$table);
      // Strip every remaining HTML tag.
      $table = preg_replace("'<[\/\!]*?[^<>]*?>'si","",$table);
      // Remove line breaks together with any whitespace that follows them,
      // including a lone "\n" that has nothing after it.
      $table = preg_replace("'[\r\n]+\s*'","",$table);
      // NOTE(review): this replacement appeared twice in a row in the earlier
      // code; the second call could not change anything. One of the two was
      // presumably meant for something else (e.g. "&nbsp;") - confirm.
      $table = str_replace(" ","",$table);

      $rows = explode('{tr}', $table);
      // Whatever follows the last row marker is not a row.
      array_pop($rows);

      // Initialised up front so a table without rows yields array() instead
      // of an undefined variable.
      $td_array = array();
      foreach ($rows as $tr) {
          $cells = explode('{td}', $tr);
          // Same reasoning as for rows: drop the piece after the last cell marker.
          array_pop($cells);
          $td_array[] = $cells;
      }
      return $td_array;
  }
  89.  
  /**
   * Return all words (runs of the ASCII letters a-z / A-Z) found in a string, sorted.
   *
   * @param string $str      Text to scan.
   * @param bool   $distinct When truthy (the default), duplicate words are
   *                         removed; the comparison is case-sensitive.
   * @return array           Sorted, sequentially indexed list of words.
   */
  function split_en_str($str,$distinct=true) {
      preg_match_all('/([a-zA-Z]+)/', $str, $found);
      $words = $found[1];
      if ($distinct) {
          $words = array_unique($words);
      }
      // sort() also renumbers the keys left sparse by array_unique().
      sort($words);
      return $words;
  }
  99. ?>
Tags : , ,   阅读次数: 24

One Response to “PHP采集程序中常用的函数”

  1. wow gold

    We have been an ebay power seller and paypal confirmed seller of wow gold for years.

Leave a Reply

You must be logged in to post a comment.