最近在做领域词汇识别这个项目,第一步是语料的收集。我没有用蜘蛛(爬虫)来做,而是自己写了一段简单的代码,通过分析页面的模式来抓取,其实就是正则表达式的运用。
<?php
/**
 * Corpus collector for games.qq.com articles.
 *
 * Steps:
 *   1. Download the list-index JavaScript file and collect every list-page
 *      link it contains.
 *   2. For each list page, take the tables whose width attribute is "393"
 *      and collect the article links (anchors with class "A00") inside them.
 *   3. For each article, cut out the part between the title <div> and the
 *      editor credit and save it as a text file under results/.
 *
 * pQuery comes from ../common/class.pQuery.php. getSubstr() is not defined in
 * this script; presumably it is provided by one of the includes and returns
 * the text between the two given markers -- TODO confirm.
 */
require_once('../common/config.php');
require_once('../common/class.pQuery.php');

set_time_limit(0); // the crawl issues many HTTP requests; lift the time limit
date_default_timezone_set('Asia/Shanghai');

$baseUrl = 'http://games.qq.com';
$listIndexUrl = 'http://gamezone.qq.com/images/frontpage/js/05ntdlwy_list_next.js';

$listIndex = file_get_contents($listIndexUrl);
if ($listIndex === false) {
    echo "不能读取页面 {$listIndexUrl}";
    exit;
}

// Each capture is a site-relative list-page path such as
// /ntgame/05ntdlwy_list01.htm
preg_match_all('/<a href=(.*?)>/', $listIndex, $listMatches);

// Running number appended to every file name so that two articles saved
// within the same second do not overwrite each other.
$sequence = 0;

foreach ($listMatches[1] as $listPath) {
    $listUrl = $baseUrl . $listPath;
    $listHtml = file_get_contents($listUrl);
    if ($listHtml === false) {
        // One unreachable list page should not abort the whole crawl.
        echo "不能读取页面 {$listUrl}";
        continue;
    }

    $pq = new pQuery('HTML');
    $pq->load_string($listHtml);
    $tables = $pq->Query('//table[@width="393"]')->get();
    $tablesHtml = implode("", $tables);

    // Each capture is a site-relative article path such as
    // /a/20080304/000043.htm
    preg_match_all(
        '/<a class="A00" target="_blank" href="(.*?)">/',
        $tablesHtml,
        $articleMatches
    );

    foreach ($articleMatches[1] as $articlePath) {
        $articleUrl = $baseUrl . $articlePath;
        $articleHtml = file_get_contents($articleUrl);
        if ($articleHtml === false) {
            echo "不能读取页面 {$articleUrl}";
            continue;
        }

        $articleText = getSubstr($articleHtml, '<div id="ArticleTit">', '[责任编辑');
        echo "<br>";

        // Timestamp plus running number keeps the name unique.
        $sequence++;
        $filename = sprintf('%s_%04d', date('YmdHis'), $sequence);

        // Write the extracted text to a text file; stop on the first write
        // failure, as the output directory is most likely unusable.
        if (file_put_contents("results/{$filename}.txt", $articleText) === false) {
            echo "不能写入到文件 {$filename}.txt";
            exit;
        }
        echo "成功写入到文件{$filename}.txt";
    }
}
?>
<?php
/**
 * Corpus collector for games.qq.com articles (second listing of the script).
 *
 * Steps:
 *   1. Download the list-index JavaScript file and collect every list-page
 *      link it contains.
 *   2. For each list page, take the tables whose width attribute is "393"
 *      and collect the article links (anchors with class "A00") inside them.
 *   3. For each article, cut out the part between the title <div> and the
 *      editor credit and save it as a text file under results/.
 *
 * pQuery comes from ../common/class.pQuery.php. getSubstr() is not defined in
 * this script; presumably it is provided by one of the includes and returns
 * the text between the two given markers -- TODO confirm.
 */
require_once('../common/config.php');
require_once('../common/class.pQuery.php');

set_time_limit(0); // the crawl issues many HTTP requests; lift the time limit
date_default_timezone_set('Asia/Shanghai');

$baseUrl = 'http://games.qq.com';
$listIndexUrl = 'http://gamezone.qq.com/images/frontpage/js/05ntdlwy_list_next.js';

$listIndex = file_get_contents($listIndexUrl);
if ($listIndex === false) {
    echo "不能读取页面 {$listIndexUrl}";
    exit;
}

// Each capture is a site-relative list-page path such as
// /ntgame/05ntdlwy_list01.htm
preg_match_all('/<a href=(.*?)>/', $listIndex, $listMatches);

// Running number appended to every file name so that two articles saved
// within the same second do not overwrite each other.
$sequence = 0;

foreach ($listMatches[1] as $listPath) {
    $listUrl = $baseUrl . $listPath;
    $listHtml = file_get_contents($listUrl);
    if ($listHtml === false) {
        // One unreachable list page should not abort the whole crawl.
        echo "不能读取页面 {$listUrl}";
        continue;
    }

    $pq = new pQuery('HTML');
    $pq->load_string($listHtml);
    $tables = $pq->Query('//table[@width="393"]')->get();
    $tablesHtml = implode("", $tables);

    // Each capture is a site-relative article path such as
    // /a/20080304/000043.htm
    preg_match_all(
        '/<a class="A00" target="_blank" href="(.*?)">/',
        $tablesHtml,
        $articleMatches
    );

    foreach ($articleMatches[1] as $articlePath) {
        $articleUrl = $baseUrl . $articlePath;
        $articleHtml = file_get_contents($articleUrl);
        if ($articleHtml === false) {
            echo "不能读取页面 {$articleUrl}";
            continue;
        }

        $articleText = getSubstr($articleHtml, '<div id="ArticleTit">', '[责任编辑');
        echo "<br>";

        // Timestamp plus running number keeps the name unique.
        $sequence++;
        $filename = sprintf('%s_%04d', date('YmdHis'), $sequence);

        // Write the extracted text to a text file; stop on the first write
        // failure, as the output directory is most likely unusable.
        if (file_put_contents("results/{$filename}.txt", $articleText) === false) {
            echo "不能写入到文件 {$filename}.txt";
            exit;
        }
        echo "成功写入到文件{$filename}.txt";
    }
}
?>