八字命令做了不少改造,最后发现还是需要大量的案例;有些条文总是太模糊,又没有权威解释,让人很难相信那些信手拈来的“想象力”。
1 scrapy_redis
Scrapy-redis分布式+Scrapy-redis实战
pip install scrapy_redis -i https://pypi.tuna.tsinghua.edu.cn/simple
使用scrapy_redis,获取百度人物的链接,代码很简单,就是从文件中按行读取名称,写入到redis中
# -*- coding: utf-8 -*-
# @time : 2021/8/8 19:25
# @author : dzm
# @desc : Seed the Redis list 'bp:sport' with the names read line by line from a text file
import redis
def read_file(conn, file_name):
    """Push every non-empty line of a UTF-8 text file onto the Redis queue.

    Args:
        conn: Redis client handed through to ``write_redis``.
        file_name: Path of the text file holding one name per line.
    """
    with open(file_name, encoding='utf-8') as lines:
        for line in lines:
            # Iterating a text file keeps the trailing newline on each line;
            # strip it so the stored value is the bare name.
            name = line.strip()
            # Blank lines carry no name, so they are not queued.
            if name:
                write_redis(conn, name)
def write_redis(conn, value):
    """Append *value* to the tail of the ``bp:sport`` Redis list.

    Args:
        conn: Object exposing an ``rpush(key, value)`` method.
        value: The entry to append.
    """
    queue_key = 'bp:sport'
    conn.rpush(queue_key, value)
def get_conn():
    """Return a Redis client bound to 127.0.0.1 on port 6379."""
    return redis.Redis(host='127.0.0.1', port=6379)
if __name__ == '__main__':
    # Script entry point: load the names file into Redis, then report completion.
    read_file(get_conn(), r'../files/bd_sport')
    print('写入人物完成')
2 从百度人物链接解析出需要的内容
继承RedisSpider,指定redis_key即可
# -*- coding: utf-8 -*-
# @time : 2021/8/8 12:00
# @author : dzm
# @desc : Spider that takes names from the Redis key 'bp:sport' and parses the matching Baidu Baike pages
import re
import scrapy
from scrapy_redis.spiders import RedisSpider
from pyquery import PyQuery as pq
from life_example.items import QqPersonItem
import datetime
class Bp1Spdier(RedisSpider):
    """Spider that turns names read from Redis into Baidu Baike lookups.

    Every value taken from the ``bp:sport`` Redis key is used as an entry
    name, requested from ``baike.baidu.com`` and parsed into one
    ``QqPersonItem``.
    """

    name = "bp1"
    allowed_domains = ['baidu.com']
    redis_key = 'bp:sport'

    # Label of a <dt> cell (all whitespace removed) -> item field it fills.
    _FIELD_BY_LABEL = {
        '中文名': 'name',
        '国籍': 'nation',
        '出生地': 'birthplace',
        '出生日期': 'birthday',
        '运动项目': 'occupation',
        '主要奖项': 'desc',
        '主要成就': 'desc',
    }

    # Supported date layouts; each one captures year, month and day.
    _DATE_PATTERNS = (
        re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})'),
        re.compile(r'(\d{4})\.(\d{1,2})\.(\d{1,2})'),
        re.compile(r'(\d{4})年(\d{1,2})月(\d{1,2})'),
    )

    def make_requests_from_url(self, url):
        """Build the Baike request for one queued entry name.

        Args:
            url: The entry name taken from Redis (not a full URL).

        Returns:
            A ``scrapy.Request`` whose callback is ``parse_content``.
        """
        # Surrounding whitespace (e.g. a newline kept when the name was
        # queued) must not end up inside the request URL.
        url = 'https://baike.baidu.com/item/{}'.format(url.strip())
        print(url)
        return scrapy.Request(url=url, method='GET', dont_filter=True,
                              callback=self.parse_content)

    def parse_content(self, response):
        """Extract the basic-info table of a page into a ``QqPersonItem``.

        Args:
            response: The downloaded page.

        Yields:
            One ``QqPersonItem`` filled from the first two ``<dl>`` lists
            inside ``.basic-info``.
        """
        print('进入到bp1')
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        soup = pq(response.text)
        # Basic information
        item = QqPersonItem()
        for column in range(2):
            labels = soup('.basic-info dl:eq({}) dt'.format(column))
            for index, label in enumerate(labels):
                # Each <dt> is paired with the <dd> at the same position in
                # the same <dl>.
                value = soup(
                    '.basic-info dl:eq({}) dd:eq({})'.format(column, index)
                ).text()
                # ``label.text`` is None when the element has no leading text.
                self.get_sport_info(label.text or '', value, item)
        yield item

    def get_sport_info(self, dt, dd, item):
        """Store *dd* in the item field that the label *dt* stands for.

        Labels that are not in ``_FIELD_BY_LABEL`` are ignored.

        Args:
            dt: Raw label text of the table row.
            dd: Value text of the table row.
            item: Item that receives the value.
        """
        # Remove every whitespace character: none of the known labels
        # contains whitespace, so a label padded with spaces still matches.
        label = re.sub(r'\s+', '', dt or '')
        field = self._FIELD_BY_LABEL.get(label)
        if field is None:
            return
        if field == 'birthday':
            item[field] = self.get_birthday(dd)
        else:
            item[field] = dd

    def get_birthday(self, date):
        """Parse the date at the start of *date*.

        Recognised layouts are ``YYYY-M-D``, ``YYYY.M.D`` and ``YYYY年M月D``
        (optionally followed by ``日``); anything after the date is ignored.

        Args:
            date: Text that is expected to begin with a date.

        Returns:
            A ``datetime.datetime`` at midnight of that day, or ``None`` when
            the text starts with no recognised layout or names an impossible
            calendar date.
        """
        text = (date or '').strip()
        for pattern in self._DATE_PATTERNS:
            found = pattern.match(text)
            if found is None:
                continue
            year, month, day = (int(part) for part in found.groups())
            try:
                return datetime.datetime(year, month, day)
            except ValueError:
                # Matched the layout but is not a real date (e.g. month 13).
                return None
        return None
if __name__ == '__main__':
    # Intentionally empty: running this module directly does nothing.
    pass