一些采集程序中常用的函数整理，强烈推荐。绝对没有坑爹，有木有！ //获得当前的脚本网址 function get_php_url(){ if(!empty($_SERVER["REQUEST_URI"])){ $scriptName = $_SERVER["REQUEST_URI"]; $nowurl = $scriptName; }else{ $scriptName = $_SERVER["PHP_SELF"]; if(em…
		
		
		
		   
一些采集程序中常用的函数整理，强烈推荐。绝对没有坑爹，有木有！
 
//获得当前的脚本网址   
function get_php_url(){   
        if(!emptyempty($_SERVER["REQUEST_URI"])){   
                $scriptName = $_SERVER["REQUEST_URI"];   
                $nowurl = $scriptName;   
        }else{   
                $scriptName = $_SERVER["PHP_SELF"];   
                if(emptyempty($_SERVER["QUERY_STRING"])) $nowurl = $scriptName;   
                else $nowurl = $scriptName."?".$_SERVER["QUERY_STRING"];     
        }   
        return $nowurl;   
}   
//把全角数字转为半角数字   
function GetAlabNum($fnum){   
        $nums = array("0","1","2","3","4","5","6","7","8","9");   
        $fnums = "0123456789";   
        for($i=0;$i<=9;$i++) $fnum = str_replace($nums[$i],$fnums[$i],$fnum);   
        $fnum = ereg_replace("[^0-9\.]|^0{1,}","",$fnum);   
        if($fnum=="") $fnum=0;   
        return $fnum;   
}   
//去除HTML标记    
 function Text2Html($txt){   
        $txt = str_replace("  "," ",$txt);   
        $txt = str_replace("<","<",$txt);   
        $txt = str_replace(">",">",$txt);   
        $txt = preg_replace("/[\r\n]{1,}/isU","<br/>\r\n",$txt);   
        return $txt;   
}   
//清除HTML标记   
function ClearHtml($str){   
        $str = str_replace('<','<',$str);   
        $str = str_replace('>','>',$str);   
        return $str;   
}   
//相对路径转化成绝对路径    
 function relative_to_absolute($content, $feed_url) {   
    preg_match('/(http|https|ftp):\/\//', $feed_url, $protocol);   
    $server_url = preg_replace("/(http|https|ftp|news):\/\//", "", $feed_url);   
    $server_url = preg_replace("/\/.*/", "", $server_url);   
    if ($server_url == '') {   
        return $content;   
    }   
    if (isset($protocol[0])) {   
        $new_content = preg_replace('/href="\//', 'href="'.$protocol[0].$server_url.'/', $content);   
        $new_content = preg_replace('/src="\//', 'src="'.$protocol[0].$server_url.'/', $new_content);   
    } else {    
         $new_content = $content;   
    }   
    return $new_content;   
}   
//取得所有链接   
function get_all_url($code){   
        preg_match_all('/<a\s+href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)<\/a>/i',$code,$arr);   
        return array('name'=>$arr[2],'url'=>$arr[1]);   
}   
//获取指定标记中的内容   
function get_tag_data($str, $start, $end){   
        if ( $start == '' || $end == '' ){   
               return;   
        }   
        $str = explode($start, $str);    
         $str = explode($end, $str[1]);   
        return $str[0];   
}   
//HTML表格的每行转为CSV格式数组   
function get_tr_array($table) {   
        $table = preg_replace("'<td[^>]*?>'si",'"',$table);   
        $table = str_replace("</td>",'",',$table);   
        $table = str_replace("</tr>","{tr}",$table);   
        //去掉 HTML 标记   
        $table = preg_replace("'<[\/\!]*?[^<>]*?>'si","",$table);   
        //去掉空白字符    
        $table = preg_replace("'([\r\n])[\s]+'","",$table);    
         $table = str_replace(" ","",$table);   
        $table = str_replace(" ","",$table);   
        $table = explode(",{tr}",$table);   
        array_pop($table);   
        return $table;   
}   
//将HTML表格的每行每列转为数组,采集表格数据   
function get_td_array($table) {   
        $table = preg_replace("'<table[^>]*?>'si","",$table);   
        $table = preg_replace("'<tr[^>]*?>'si","",$table);   
        $table = preg_replace("'<td[^>]*?>'si","",$table);    
         $table = str_replace("</tr>","{tr}",$table);   
        $table = str_replace("</td>","{td}",$table);   
        //去掉 HTML 标记   
        $table = preg_replace("'<[\/\!]*?[^<>]*?>'si","",$table);   
        //去掉空白字符    
        $table = preg_replace("'([\r\n])[\s]+'","",$table);   
        $table = str_replace(" ","",$table);   
        $table = str_replace(" ","",$table);             
        $table = explode('{tr}', $table);    
         array_pop($table);   
        foreach ($table as $key=>$tr) {   
                $td = explode('{td}', $tr);   
                array_pop($td);   
            $td_array[] = $td;   
        }   
        return $td_array;   
}   
//返回字符串中的所有单词 $distinct=true 去除重复   
function split_en_str($str,$distinct=true) {   
        preg_match_all('/([a-zA-Z]+)/',$str,$match);   
        if ($distinct == true) {    
                 $match[1] = array_unique($match[1]);   
        }   
        sort($match[1]);   
        return $match[1];   
}   
 
  
 延伸阅读：dede自带采集器的高阶技巧；dedecms采集过滤；DeDeCMS采集的问题与技巧集锦；火车采集器使用教程；phpcms新闻采集（示例）；PHP采集程序中常用的函数；PHP采集之CURL函数库；PHP采集程序的思路