Chapter 4.4: Passing Arguments to Scrapy Callbacks with lambda

I looked up 聂卫平 on Baidu Baike and wanted to see what the players of the 中国棋院 (China Qiyuan) have in common, so I decided to write a spider and download all of their pages in one go.

The player list on that page is not rendered statically; it is fetched from the backend as a JSON request. The spider below uses a lambda to pass extra arguments to each request's callback.

import scrapy
import json
from pyquery import PyQuery as pq
from life_example.items import PersonBaiKeItem
from life_example.utils.util import get_uuid,remove_js_css

'''
Go players of the 中国棋院 (China Qiyuan)
'''
class WeiQiZgqySpider(scrapy.Spider):
    name = "zgqy"
    start_urls = [
        "https://baike.baidu.com/guanxi/jsondata?action=getViewLemmaData&args=%5B0%2C8%2C%7B%22fentryTableId%22%3A18311%2C%22lemmaId%22%3A9598014%2C%22subLemmaId%22%3A9598014%7D%2Cfalse%5D",
    ]

    def parse(self, response):
        data = json.loads(response.body_as_unicode())
        soup = pq(data['html'])
        # The div.relation-unit blocks are ordered from 九段 (9 dan) down to 一段 (1 dan)
        dan_levels = ['九', '八', '七', '六', '五', '四', '三', '二', '一']
        for index, dan in enumerate(dan_levels):
            unit = soup('div.relation-unit').eq(index)
            links = pq(unit)('a')
            occupation = '中国棋院' + dan + '段'
            for link in links:
                lo = pq(link)
                # Skip players whose encyclopedia page has not been created yet
                if lo.attr('title') == '待创建':
                    continue
                url = lo.attr('href')
                name = lo.text()
                # Bind name and occupation as default arguments so each lambda
                # keeps the values of the current loop iteration
                yield scrapy.Request(
                    url,
                    callback=lambda response, name=name, occupation=occupation:
                    self.parse_page(response, name, occupation))

    def parse_page(self,response,name,occupation):
        print(name,occupation)
        item = PersonBaiKeItem()
        item['id'] = get_uuid()
        item['name']=name
        item['url']=response.url
        item['category']='围棋'
        item['detail']=remove_js_css(response.body_as_unicode())
        item['occupation']=occupation
        yield item
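
The name=name, occupation=occupation default arguments in the lambda are the important part. A bare closure such as callback=lambda r: self.parse_page(r, name, occupation) is evaluated lazily: by the time Scrapy actually calls the callback, the loop has long moved on, so every request would see the values of the last iteration. Binding the current values as default arguments freezes them per request. A minimal sketch of the difference, in plain Python without Scrapy:

# Late-binding pitfall that the default arguments avoid
callbacks_wrong = []
callbacks_right = []
for name in ['聂卫平', '常昊', '古力']:
    # Wrong: the lambda looks up `name` when it is called, not when it is defined,
    # so all three callbacks end up seeing the last value of the loop variable
    callbacks_wrong.append(lambda: name)
    # Right: name=name binds the current value as a default argument
    callbacks_right.append(lambda name=name: name)

print([cb() for cb in callbacks_wrong])   # ['古力', '古力', '古力']
print([cb() for cb in callbacks_right])   # ['聂卫平', '常昊', '古力']

Scrapy 1.7 and later also provide cb_kwargs for exactly this purpose, e.g. scrapy.Request(url, callback=self.parse_page, cb_kwargs={'name': name, 'occupation': occupation}), which passes the extra arguments without a lambda; older versions can carry them in meta.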

Strip the JavaScript, CSS and other useless markup from the page before saving its content:

import re

def remove_js_css(content):
    """
    Remove <script>, <style>, HTML comments, <meta>, <ins> tags and blank lines from the page
    """
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    sc = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    # r = re.compile(r'''<a.*?</a>''', re.I | re.M | re.S)
    # sc = r.sub('', sc)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''^\s+$''', re.M | re.S)
    sc = r.sub('', sc)
    r = re.compile(r'''\n+''', re.M | re.S)
    sc = r.sub('\n', sc)
    return sc
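
A quick sanity check of what the function strips, run on a small hypothetical HTML fragment:

sample = '''<html><head><meta charset="utf-8"><style>.x{color:red}</style></head>
<body><!-- ad --><script>alert(1)</script><p>聂卫平，中国围棋职业九段</p>

</body></html>'''

print(remove_js_css(sample))
# <html><head></head>
# <body><p>聂卫平，中国围棋职业九段</p>
# </body></html>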

If the url is interpolated into the SQL statement as-is and contains Chinese characters, its percent-encoded form (e.g. %E8) causes an error: the % signs are mistaken for format placeholders. Adding url = re.sub(r'%(?!%)', '%%', url) doubles every single % and works around this.

    def is_exist_baidu_person(self, url):
        url = re.sub(r'%(?!%)', '%%', url)
        sql = "select * from baidu_person where url='{}'".format(url)
        df = pd.read_sql(sql, self.baidu_engine)
        results = json.loads(df.to_json(orient='records'))
        if len(results) == 0:
            return False
        return True
        # return False
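
The % escaping is only needed because the url is formatted directly into the SQL string. Letting the driver bind the value instead avoids both the percent issue and SQL injection; a minimal sketch of that variant, assuming the same self.baidu_engine SQLAlchemy engine:

    # requires: from sqlalchemy import text
    def is_exist_baidu_person(self, url):
        # Bound parameter: the driver handles the percent-encoded characters itself
        sql = text("select * from baidu_person where url = :url")
        df = pd.read_sql(sql, self.baidu_engine, params={'url': url})
        return not df.empty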

    def save_baidu_person(self, data):
        if not self.is_exist_baidu_person(data['url']):
            # Save the page content to a file
            name = data['name'] + data['id'][0:2]
            file_name = self.save_file(name, data['detail'])
            data['detail'] = file_name
            # Write the file path into the database
            df = pd.DataFrame([data])
            df.to_sql('baidu_person', self.baidu_engine, if_exists='append', index=False)

    def save_file(self, name, detail):
        ROOT_PATH = '/Users/dzm/Documents/baidu/weiqi_hg'
        file_name = os.path.join(ROOT_PATH, name)
        with open(file_name, 'w') as file:
            file.writelines(detail)
        print('File saved successfully')
        return file_name
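
These helpers would typically be called from an item pipeline, so that every item yielded by parse_page gets persisted. A minimal sketch of that wiring; the pipeline class name and connection string below are assumptions, not part of the original project:

from sqlalchemy import create_engine

class BaiduPersonPipeline(object):
    # Hypothetical pipeline; the real project's pipeline class is not shown in this post
    def open_spider(self, spider):
        # Assumed MySQL connection string; replace with the real database
        self.baidu_engine = create_engine('mysql+pymysql://user:pwd@localhost/baidu')

    def process_item(self, item, spider):
        self.save_baidu_person(dict(item))
        return item

    # is_exist_baidu_person, save_baidu_person and save_file as defined above

It would then be enabled in settings.py with ITEM_PIPELINES = {'life_example.pipelines.BaiduPersonPipeline': 300} (the module path here is also an assumption).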