1. items.py: declare the field names under which the scraped data will be stored.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    URL = scrapy.Field()    # URL of the current page
    TITLE = scrapy.Field()  # <title> of the current page
    H1 = scrapy.Field()     # first-level (h1) heading
    TEXT = scrapy.Field()   # body text
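A TutorialItem behaves like a dictionary that only accepts the fields declared above; a minimal sketch of how it is used (the values here are purely illustrative):

from tutorial.items import TutorialItem

item = TutorialItem()
item['URL'] = 'http://www.sohu.com/'   # illustrative value
item['TITLE'] = 'example title'        # illustrative value
print(dict(item))    # {'URL': 'http://www.sohu.com/', 'TITLE': 'example title'}
item['AUTHOR'] = 'x' # raises KeyError: only fields declared with scrapy.Field() are allowed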
2. pipelines.py: the logic that writes the scraped data to a file.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class TutorialPipeline(object):
    def __init__(self):
        self.filename = open("content.txt", 'w', encoding="utf-8")
        self.contain = set()  # set of URLs already written, used for de-duplication

    def process_item(self, item, spider):
        text_dict = dict(item)
        if text_dict['URL'] not in self.contain:  # a page we have not written yet
            for _, target_name in text_dict.items():
                if "人" in target_name:  # only keep pages whose fields contain the character "人"
                    self.write_to_txt(text_dict)
                    break  # write each page at most once
            # remember the URL so repeated pages are filtered out automatically
            self.contain.add(text_dict['URL'])
        return item

    def close_spider(self, spider):
        self.filename.close()

    def write_to_txt(self, text_dict):
        # write every scraped field to the file, one record per page
        for key, value in text_dict.items():
            self.filename.write(key + "内容:\n" + value + '\n')
        self.filename.write(50 * '=' + '\n')
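As the generated comment above notes, this pipeline only runs if it is registered in settings.py. Assuming the project is named tutorial (as the import in the spider below suggests), the entry would look like this:

# settings.py
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,  # lower values run earlier in the pipeline chain
}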
3. sohu.py in the spiders directory: the crawler's parsing logic (the parse callbacks).
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy import Request

from tutorial.items import TutorialItem

# run the crawl from the project root: scrapy crawl sohu


class SohuSpider(scrapy.Spider):
    name = 'sohu'  # spider name
    # allowed_domains = ['www.sohu.com']  # if set, restricts crawling to these domains after the start page
    start_urls = ['http://www.sohu.com/']  # start URL

    def parse(self, response):
        all_urls = re.findall(r'href="(.*?)"', response.xpath("/html").extract_first())
        for url in all_urls:
            if re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$", url.strip()):
                pass  # skip links to static resources
            elif url.strip().startswith("http") or url.strip().startswith("//"):
                # conditional expression builds a full URL for protocol-relative links
                temp_url = url.strip() if url.strip().startswith('http') else 'http:' + url.strip()
                item = self.get_all(TutorialItem(), response)
                # only yield the item when the body text and the page title are both non-empty
                if 'TEXT' in item and item['TEXT'] != '' and item['TITLE'] != '':
                    yield item  # send the item to the pipeline
                print('sending <' + temp_url + '> to the downloader')
                yield Request(temp_url, callback=self.parse)  # follow the link recursively

    def get_all(self, item, response):
        # collect the current page's URL, title, first h1 heading and body text
        item['URL'] = response.url.strip()
        item['TITLE'] = (response.xpath('/html/head/title/text()').extract_first() or '').strip()
        contain_h1 = response.xpath('//h1/text()').extract()  # all first-level headings on the page
        contain = contain_h1[0] if len(contain_h1) != 0 else ""  # keep only the first one
        item["H1"] = contain.strip()
        main_text = []
        # walk through the text of every <p> and <br> tag on the page
        for tag in ['p', 'br']:
            sub_text = self.get_content(response, tag)
            main_text.extend(sub_text)
        # de-duplicate the collected text and only set TEXT when something remains
        main_text = list(set(main_text))
        if len(main_text) != 0:
            item['TEXT'] = '\n'.join(main_text)
        return item

    def get_content(self, response, tag):
        # keep only text fragments longer than 100 characters
        main_text = []
        contexts = response.xpath('//' + tag + '/text()').extract()
        for text in contexts:
            if len(text.strip()) > 100:
                main_text.append(text.strip())
        return main_text
To run the crawl, execute it from the project root: E:\tutorial>scrapy crawl sohu
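Based on write_to_txt above, each record saved to content.txt should look roughly like the following (the field values are made up for illustration):

URL内容:
http://www.sohu.com/a/xxxxxxxxx
TITLE内容:
(page title)
H1内容:
(first h1 heading)
TEXT内容:
(body paragraphs longer than 100 characters, one per line)
==================================================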