<?php
// Driver: collect every linked wiki entry, extract its summary text and
// export the result as one "title<TAB>text" record per CRLF-terminated line.
$all_wiki_urls = findAllWikiUrl();
$file = "";
$i = 1; // 1-based position of the next successfully converted entry
echo "总共: " .count($all_wiki_urls)."\n";
foreach ($all_wiki_urls as $title => $url) {
    $text = matchMwParserOutput($title, $url);
    // false means "nothing extractable"; an empty string means the page could
    // not be fetched or had no text. Either way there is no card to write, so
    // skip it instead of emitting a blank record and counting it as converted.
    if ($text === false || $text === "") {
        continue;
    }
    $file .= "$title\t$text\r\n";
    echo "词条 $title 转换完成, 当前位置: $i\n";
    $i++;
}
// file_put_contents() returns false on failure; surface that instead of
// finishing silently with no output file.
if (file_put_contents("words_card1.txt", $file) === false) {
    echo "Failed to write words_card1.txt\n";
    exit(1);
}
/**
 * Fetch a wiki page and return the plain text of its leading content block.
 *
 * The extracted region runs from the opening `<div class="mw-parser-output">`
 * up to and including the first `</table>` that follows it. HTML tags are
 * stripped, and line breaks plus a fixed set of navigation captions are each
 * replaced by a tab character.
 *
 * @param string $title    Entry title. Currently not used by the extraction.
 * @param string $wiki_url Location of the page (anything file_get_contents() can read).
 *
 * @return string|false The cleaned text, or false when the page cannot be
 *                      fetched, is empty, or does not contain the expected markup.
 */
function matchMwParserOutput($title, $wiki_url)
{
    $content = file_get_contents($wiki_url);
    // A failed fetch (false) and an empty body are both reported as false so
    // that callers checking `!== false` skip the entry instead of receiving "".
    if ($content === false || $content === "") {
        return false;
    }

    // Only the first occurrence is needed, so preg_match() is sufficient.
    // `[\s\S]*?` lazily spans newlines up to the first closing </table>.
    $found = preg_match("/<div class=\"mw-parser-output\">[\s\S]*?<\/table>/", $content, $match);
    if ($found !== 1) {
        return false;
    }

    $strip_text = strip_tags($match[0]);

    // Line breaks and navigation captions are turned into tabs. "\r\n" is
    // listed before "\n" and "\r" so a CRLF pair collapses into a single tab.
    $patten = array("\r\n", "\n", "\r", "卡牌总览", "遗物收集", "药水小样", "事件日志", "敌人图鉴", " →点此查看数据页面");

    return str_replace($patten, "\t", $strip_text);
}
/**
 * Collect the wiki entry links found on an index page.
 *
 * Every anchor of the form `<a href="/wiki/..." title="...">` is considered.
 * Links whose captured href contains "Icon_Rage.png" or "class" are dropped.
 * The href capture is lazy up to `" title="`, so when other attributes sit
 * between href and title they end up inside the capture; the "class" check
 * discards those polluted captures (presumably its original intent).
 *
 * @param string $url      Page to scan; defaults to the original hard-coded entry page.
 * @param string $base_url Site root prepended to each captured path.
 *
 * @return array Map of link title => absolute URL. When a title occurs more
 *               than once the last occurrence wins. Empty when the page
 *               cannot be fetched or contains no matching links.
 */
function findAllWikiUrl($url = "https://sts.huijiwiki.com/wiki/%E4%B8%8A%E5%8B%BE%E6%8B%B3", $base_url = "https://sts.huijiwiki.com/")
{
    $page_content = file_get_contents($url);
    // file_get_contents() returns false on failure; do not feed that to PCRE.
    if ($page_content === false) {
        return [];
    }

    preg_match_all("/<a href=\"(\/wiki\/.*?)\" title=\"(.*?)\">/", $page_content, $matches, PREG_SET_ORDER);

    // Captured paths already start with "/", so drop the trailing slash of
    // the site root to avoid producing "https://host//wiki/...".
    $site_root = rtrim($base_url, "/");

    $wikiArray = [];
    foreach ($matches as $match) {
        $path = $match[1];
        $title = $match[2];
        // strpos() can legitimately return 0, so compare strictly with false.
        if (strpos($path, "Icon_Rage.png") !== false) {
            continue;
        }
        if (strpos($path, "class") !== false) {
            continue;
        }
        $wikiArray[$title] = $site_root . $path;
    }

    return $wikiArray;
}
// PHP text scraping
// (Source page note: latest recommended article published on 2021-12-09 11:54:09)