I looked up 聂卫平 (Nie Weiping) on Baidu Baike because I wanted to see what the players of the 中国棋院 (China Qiyuan) have in common. The relation table on that page convinced me to write a crawler and pull down all of their entries in one go.
The table is not part of the statically loaded page; it is fetched from the backend as a JSON request, so the spider hits that endpoint directly. A lambda is used to pass extra arguments (the player's name and rank) into the per-player callback.
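Before writing the spider it is worth confirming what the endpoint actually returns. The snippet below is a quick standalone check (it uses the requests library, which the spider itself does not need): the response is a JSON object whose html field holds the rendered relation table, with one div.relation-unit block per dan rank.

import requests
from pyquery import PyQuery as pq

url = 'https://baike.baidu.com/guanxi/jsondata?action=getViewLemmaData&args=%5B0%2C8%2C%7B%22fentryTableId%22%3A18311%2C%22lemmaId%22%3A9598014%2C%22subLemmaId%22%3A9598014%7D%2Cfalse%5D'
# Baidu may reject requests without a browser-like User-Agent; add headers if needed
data = requests.get(url).json()
doc = pq(data['html'])
print(len(doc('div.relation-unit')))  # one block per dan rank, 9 dan down to 1 dan

The spider itself: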
import scrapy
import json
from pyquery import PyQuery as pq
from life_example.items import PersonBaiKeItem
from life_example.utils.util import get_uuid,remove_js_css
'''
Go (围棋) players of the 中国棋院 (China Qiyuan), crawled from Baidu Baike
'''
class WeiQiZgqySpider(scrapy.Spider):
    name = "zgqy"
    start_urls = [
        "https://baike.baidu.com/guanxi/jsondata?action=getViewLemmaData&args=%5B0%2C8%2C%7B%22fentryTableId%22%3A18311%2C%22lemmaId%22%3A9598014%2C%22subLemmaId%22%3A9598014%7D%2Cfalse%5D",
    ]
    def parse(self, response):
        html = json.loads(response.text)
        soup = pq(html['html'])
        # The relation table holds one div.relation-unit block per dan rank,
        # ordered from 9 dan (eq(0)) down to 1 dan (eq(8))
        ranks = ['九', '八', '七', '六', '五', '四', '三', '二', '一']
        for idx, rank in enumerate(ranks):
            unit = soup('div.relation-unit').eq(idx)
            links = pq(unit)('a')
            for link in links:
                lo = pq(link)
                if lo.attr('title') == '待创建':
                    # Placeholder whose Baike page has not been created yet; skip it
                    continue
                url = lo.attr('href')
                name = lo.text()
                occupation = '中国棋院{}段'.format(rank)
                # Bind name/occupation as lambda default arguments so every request
                # keeps its own values; a bare closure would only see the final
                # loop values by the time the callback actually runs
                yield scrapy.Request(url, callback=lambda response, name=name, occupation=occupation: self.parse_page(response, name, occupation))

    def parse_page(self, response, name, occupation):
        print(name, occupation)
        item = PersonBaiKeItem()
        item['id'] = get_uuid()
        item['name'] = name
        item['url'] = response.url
        item['category'] = '围棋'
        item['detail'] = remove_js_css(response.text)
        item['occupation'] = occupation
        yield item
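The default arguments in the lambda are the important detail: Python closures capture variables, not values, so without name=name and occupation=occupation every callback would see whatever the loop variables held when Scrapy finally invoked it. On Scrapy 1.7 or newer, cb_kwargs does the same job more explicitly; a sketch of that variant, with parse_page left unchanged:

yield scrapy.Request(
    url,
    callback=self.parse_page,
    cb_kwargs={'name': name, 'occupation': occupation},
)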
Strip JS, CSS and other useless markup from the page before saving its content:
import re

def remove_js_css(content):
    """
    Strip <script>, <style>, HTML comments, <meta> and <ins> tags
    plus blank lines from the page source.
    """
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    sc = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    # r = re.compile(r'''<a.*?</a>''', re.I | re.M | re.S)
    # sc = r.sub('', sc)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''^\s+$''', re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''\n+''', re.M | re.S)
    sc = r.sub('\n', sc)
    return sc
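A quick check of what the function does (the sample HTML here is made up for illustration):

sample = ('<html><head><script>var x=1;</script><style>p{}</style></head>'
          '<body><!-- nav --><p>聂卫平</p>\n\n\n</body></html>')
print(remove_js_css(sample))
# the script, style and comment blocks are gone, and the run of blank lines collapses to a single newline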
The URL is interpolated straight into the SQL string below. When it contains percent-encoded Chinese characters, the database driver treats the % signs as parameter placeholders and the query fails; adding url = re.sub(r'%(?!%)', '%%', url) doubles them up and fixes this. These helpers are methods of the persistence class that owns self.baidu_engine and assume re, json, os and pandas as pd are imported; a parameterized-query alternative is sketched after the function.
    def is_exist_baidu_person(self, url):
        # Double the % signs so the driver does not read them as placeholders
        url = re.sub(r'%(?!%)', '%%', url)
        sql = "select * from baidu_person where url='{}'".format(url)
        df = pd.read_sql(sql, self.baidu_engine)
        results = json.loads(df.to_json(orient='records'))
        if len(results) == 0:
            return False
        return True
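For reference, a parameterized version of the same lookup sidesteps both the %-escaping issue and SQL injection. This is only a sketch and assumes the same self.baidu_engine SQLAlchemy engine:

    from sqlalchemy import text

    def is_exist_baidu_person(self, url):
        # Bound parameters let the driver handle quoting/escaping, so the
        # percent signs in the encoded URL need no special treatment
        sql = text("select count(*) from baidu_person where url = :url")
        with self.baidu_engine.connect() as conn:
            count = conn.execute(sql, {'url': url}).scalar()
        return count > 0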
    def save_baidu_person(self, data):
        if not self.is_exist_baidu_person(data['url']):
            # Save the page content to a file on disk
            name = data['name'] + data['id'][0:2]
            file_name = self.save_file(name, data['detail'])
            data['detail'] = file_name
            # Store the file path (not the content) in the database
            df = pd.DataFrame([data])
            df.to_sql('baidu_person', self.baidu_engine, if_exists='append', index=False)
    def save_file(self, name, detail):
        ROOT_PATH = '/Users/dzm/Documents/baidu/weiqi_hg'
        file_name = os.path.join(ROOT_PATH, name)
        # Write the cleaned page content; utf-8 keeps the Chinese text intact
        with open(file_name, 'w', encoding='utf-8') as f:
            f.writelines(detail)
        print('文件保存成功')
        return file_name
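How these pieces hang together is not shown above. The sketch below is one plausible wiring (the class name, module path and connection string are my own guesses, not from the original project): an item pipeline that owns the SQLAlchemy engine and calls save_baidu_person for every PersonBaiKeItem the spider yields.

# settings.py (hypothetical)
# ITEM_PIPELINES = {'life_example.pipelines.BaiduPersonPipeline': 300}

import json
import os
import re

import pandas as pd
from sqlalchemy import create_engine


class BaiduPersonPipeline(object):
    def open_spider(self, spider):
        # Placeholder connection string; point it at your own MySQL instance
        self.baidu_engine = create_engine('mysql+pymysql://user:password@localhost/baidu')

    def process_item(self, item, spider):
        self.save_baidu_person(dict(item))
        return item

    # is_exist_baidu_person / save_baidu_person / save_file from above go here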