要求
获取第七史诗wiki站中每个小项目中的标题,段落文字或表格内容,以及数据源
的url。保留排版
分析
文字内容:直接请求详情页url后解析获得
表格内容:该站动态生成的表格内容数据来源有两处:一是 HTML 网页中的 script 标签,二是通过 AJAX 加载(POST 请求)。后者为难点:需要从详情页构造参数 payloads,请求到表格数据内容后再提取所有表格的链接(这些链接可以直接 GET 访问;为了减少参数,不选用 POST 请求使用的 url)。
实现
import scrapy
import json
from scrapy import Request
from scrapy import Selector
import re
import pandas as pd
from scrapy import Request, signals
import requests
class CrawlSpider(scrapy.Spider):
    """Spider for the epic7.gamekee.com wiki.

    Collects the title, paragraph text or table content, and the
    data-source URL for every sub-project page, preserving layout.
    """

    name = 'crawl'

    def __init__(self, **kwargs):
        # self.data accumulates one dict per scraped page:
        # {'title': ..., 'url': ..., 'content': ...}
        super().__init__(**kwargs)
        self.data = []
def start_requests(self):
    """Entry point: fetch the wiki home page twice — once routed to the
    table-page collector, once to the paragraph-page collector."""
    # List page of the wiki site.
    home = "http://epic7.gamekee.com/"
    yield Request(url=home, callback=self.get_list_t)
    # dont_filter=True is mandatory here: this is the same URL as above,
    # so without it the duplicate filter would drop the second request.
    yield Request(url=home, callback=self.get_list_p, dont_filter=True)
# 分析的段落内容
def get_list_p(self, response):
    """Collect paragraph-page ids from the list page and schedule a
    detail-page request for each one.

    Selects every nav link except the table section (data-id="193"),
    extracts the numeric page id from the href, and requests
    http://epic7.gamekee.com/<id>.html, parsed by get_p.
    """
    res = Selector(response)
    page_ids = []
    for a in res.xpath(r'//div[contains(@class,"func-btn-wrapper")][not(@data-id="193")]/div/a'):
        href = a.xpath(r'./@href').get()
        print(href)
        if not href:
            # BUG FIX: the original caught the TypeError from
            # re.findall(None) and then appended the id left over from
            # the previous iteration (NameError on the first one).
            continue
        # First run of digits in the href is the page id.
        found = re.findall(r'[\d]+', href)
        if not found:
            continue
        page_ids.append(found[0])
    print(f"段落id{page_ids}")
    for page_id in page_ids:
        print("id" + page_id)
        # Detail page for this id.
        url = f"http://epic7.gamekee.com/{page_id}.html"
        yield Request(url=url, callback=self.get_p, cb_kwargs={'id': page_id})
def get_p(self, response, **kwargs):
    """Parse a paragraph detail page and record it in self.data.

    The content is the text of every top-level child of the article
    body joined with newlines, preserving the page's paragraph layout.
    Empty pages are skipped.

    cb_kwargs: 'id' — numeric page id used to rebuild the source URL.
    """
    res = Selector(response)
    # BUG FIX: guard the missing-title case — the original called
    # .strip() directly on .get(), raising AttributeError on None.
    title = (res.xpath('//h3[contains(@class,"view-title")]/text()').get() or '').strip()
    # string(.) flattens each element to its visible text.
    paragraphs = [
        p.xpath(r'string(.)').get()
        for p in res.xpath('//div[contains(@class,"article-content")]/*')
    ]
    content = "\n".join(paragraphs)
    if content == '':
        # Nothing worth recording on this page.
        return
    url = f'http://epic7.gamekee.com/{kwargs["id"]}.html'
    self.data.append({
        'title': title,
        'url': url,
        'content': content,
    })
def get_list_t(self, response):
    """Collect table-page ids (nav section data-id="193") and schedule a
    detail-page request for each one, parsed by get_table."""
    res = Selector(response)
    page_ids = []
    for a in res.xpath(r'//div[contains(@class,"func-btn-wrapper")][@data-id="193"]/div/a'):
        href = a.xpath(r'./@href').get()
        print(href)
        if not href:
            # BUG FIX: the original caught the TypeError from
            # re.findall(None) and then appended the id left over from
            # the previous iteration (NameError on the first one).
            continue
        found = re.findall(r'[\d]+', href)
        if not found:
            continue
        page_ids.append(found[0])
    print(f"表格id{page_ids}")
    for page_id in page_ids:
        url = f"http://epic7.gamekee.com/{page_id}.html"
        yield Request(url=url, callback=self.get_table, cb_kwargs={'id': page_id})
图1
图2
提取 moduleData 这一变量的值:定位其开头(var moduleData = [)和结尾(];),截取中间内容后使用 json 解析
图3
def get_table(self, response, **kwargs):
    """Parse a table detail page and record its tables in self.data.

    Tables come from one of two places:
      1. rendered inline inside the third <script> of the page — parsed
         directly from that script's markup; or
      2. loaded via AJAX — rebuilt here by extracting the module ids
         from `var moduleData = [...]` in the script, POSTing them to
         editor/getmodelbyid.html, and collecting the plain-GET
         /contentmoba/<id>.html links as the data-source urls.

    cb_kwargs: 'id' — numeric page id used to rebuild the detail URL.
    """
    url = f'http://epic7.gamekee.com/{kwargs["id"]}.html'
    res = Selector(text=response.text)
    # BUG FIX: the original XPath used contains(@class, view-title)
    # without quotes — an (empty) node-set comparison that matches any
    # <h3>. Quote the literal and guard against a missing node.
    title = (res.xpath(r'//h3[contains(@class,"view-title")]/text()').get() or '').strip()
    # Script tag carrying the js-rendered table markup (see 图1/图2).
    cnt = res.xpath(r'//div[@class="container main-content"]/script[3]/text()').get()
    if cnt is None:
        # No script payload at all: nothing to parse on this page
        # (the original crashed in Selector(text=None)).
        return
    rescript = Selector(text=cnt)
    tbody = rescript.xpath(r"//table[contains(@class,'mould-table')]/tbody").get()
    if tbody is None:
        # The script holds no table: the data is served by the AJAX
        # endpoint, so rebuild the POST payload from moduleData (图3).
        try:
            ids = re.findall(r'var\smoduleData\s=\s\[(.*?)];', cnt)
            ids = json.loads(ids[0])["data"]
        except Exception:
            # moduleData missing or malformed — give up on this page.
            return
        module_ids = []
        for item in ids:
            # Some entries carry no "id" key; skip those.
            try:
                module_ids.append(str(item["id"]))
            except Exception:
                continue
        payload = "id=" + ",".join(module_ids)
        api_url = "http://epic7.gamekee.com/editor/getmodelbyid.html"
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/96.0.4664.93 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        # NOTE(review): a blocking requests call inside a Scrapy
        # callback — kept because the POST response is needed
        # synchronously to build this record.
        api_response = requests.request("POST", api_url, headers=headers, data=payload)
        res = Selector(text=api_response.text)
        # The table source pages are all linked as
        # /contentmoba/<id>.html — two lines instead of walking the
        # JSON index structure; keep the directly-accessible GET urls.
        content_ids = re.findall(r'/contentmoba/([\d]+).html', res.get())
        url = [f"http://epic7.gamekee.com/{cid}.html" for cid in content_ids]
    else:
        res = rescript
    # Flatten every table: tab-separated cells, newline-separated rows;
    # the very first cell gets an extra newline (original layout kept).
    tables = []
    for table in res.xpath(r"//table[contains(@class,'mould-table')]/tbody"):
        first_cell = True
        rows = []
        for tr in table.xpath("./tr"):
            cells = []
            for td in tr.xpath("./td"):
                print(td.xpath('string(.)').get(), end="\t")
                if first_cell:
                    first_cell = False
                    cells.append(td.xpath('string(.)').get() + '\n')
                else:
                    cells.append(td.xpath('string(.)').get())
            rows.append("\t".join(cells))
            print(end="\n")
        text = "\n".join(rows)
        # BUG FIX: the original tested `t is not None`, which is always
        # true for a str.join result; drop empty tables instead.
        if text:
            tables.append(text)
    content = "\n\n".join(tables)
    if content == '':
        return
    record = {
        "title": title,
        # url is either the detail-page string or, for the AJAX branch,
        # the stringified list of source links (original behavior).
        "url": str(url),
        "content": content,
    }
    print(record)
    self.data.append(record)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    """Build the spider and connect spider_closed to the spider_closed
    signal so the collected data is exported when the crawl ends."""
    spider = cls(**kwargs)
    spider._set_crawler(crawler)
    # Without this hook spider_closed would never run and no report
    # file would be written.
    crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
    return spider
def spider_closed(self, spider):
    """Export all collected records to an Excel workbook on close.

    Returns the number of collected records.
    """
    print(f'一共 {len(self.data)} 条数据')
    self.logger.info(f'一共 {len(self.data)} 条数据')
    output = '游戏数据.xlsx'
    # The ExcelWriter context manager guarantees the workbook is
    # flushed and closed even if to_excel raises.
    with pd.ExcelWriter(output) as writer:
        pd.DataFrame(self.data).to_excel(writer, sheet_name='data', index=False)
    # Consistency fix: use self.logger throughout — the original mixed
    # self.logger and spider.logger in the same method.
    self.logger.info('生成报表成功: %s', output)
    return len(self.data)
说明:72700 895两个参数的来源
network面板里的请求链接
细节
单引号和双引号
html 和 htmlElement对象之间的转换
eval 无法转换 null 的问题:接口返回的数据中含有 JS 的 null,Python 的 eval 不认识该名称。
解决办法是在调用前声明并定义全局变量:
global null
null = ''
content = eval(content)
(更安全的做法是改用 json.loads 代替 eval,它原生支持 null。)