Using Scrapy + Selenium automation to crawl Toutiao (今日头条) content

This is my first blog post, so the structure may be a bit messy. It is also my first hands-on project with the Scrapy framework.

Spider code (toutiao.py)

Two things in the spider deserve attention. First, path must point to your own ChromeDriver executable. Second, the condition self.num == 5 in the if branch is the threshold that decides how many requests are yielded before the search-result browser scrolls down to load more articles; if you hit a list index out of range error, lower this threshold. The right value depends on the keyword, so adjust it case by case. For installing the driver itself, see https://blog.csdn.net/n123456uo/article/details/91412740. toutiao.py is shown below.
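
Before the spider itself, one note: it imports ToutiaoproItem from toutiaopro.items. The post does not show items.py, but judging from the four fields the spider fills in, a minimal sketch would be:

# items.py -- a sketch inferred from the fields used in the spider and the pipeline;
# this file is not shown in the original post
import scrapy

class ToutiaoproItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    time = scrapy.Field()
    author = scrapy.Field()

Now for toutiao.py: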

import scrapy
from selenium import webdriver
from toutiaopro.items import ToutiaoproItem
from time import sleep
from scrapy.http import HtmlResponse

class ToutiaoSpider(scrapy.Spider):
    name = 'toutiao'
    data = input("Enter the keyword to search for: ")
    number = int(input("Enter how many articles to crawl: "))  # upper bound on the number of articles
    address = 'https://www.toutiao.com/search/?keyword=' + data
    start_urls = [address]
    urls = []   # collected article links
    num = 0     # requests yielded since the list browser last scrolled down
    index = 0   # index of the next article link to request


    # initialize the two browsers: bro1 loads the search-result list, bro2 loads individual articles
    def __init__(self):
        path = r'H:\PythonCode\Spider\scrapy\wangyi\wangyi\spiders\chromedriver.exe'  # change to your own ChromeDriver path
        self.bro1 = webdriver.Chrome(executable_path=path)
        self.bro2 = webdriver.Chrome(executable_path=path)


    # parse the search-result list for the keyword
    def parse(self, response):
        # one div per search result
        div_list = response.xpath('/html/body/div/div[4]/div[2]/div[3]/div/div/div')

        for div in div_list:
            url_temp = div.xpath('./div/div/div/div/div//@href').extract_first()
            # splice the article URL together from the relative href
            url = 'https://www.toutiao.com/a'+url_temp.split('/',3)[2]
            self.urls.append(url)
            print("--------")
            print(url)
            print("---------")

        # keep yielding requests until the requested number of articles has been crawled
        while self.index <= self.number:
            index = self.index
            yield scrapy.Request(self.urls[index], callback=self.parse_model)

            # after a batch of articles, scroll the list browser to load more results;
            # if a keyword's result page holds fewer entries than this threshold,
            # lower it, otherwise self.urls[index] raises "list index out of range"
            if self.num == 5:
                # scroll to the bottom of the search-result page
                self.bro1.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                sleep(5)
                page_text = self.bro1.page_source
                print("inside if")

                new_response = HtmlResponse(url=self.address, body=page_text, encoding='utf-8')
                self.artical_list(new_response)
                self.num = 0
                self.index = self.index + 1
                print("index inside if:", self.index)
            else:
                print("inside else, num =", self.num)
                self.index = self.index + 1
                print("index inside else:", self.index)
                self.num = self.num + 1
    # parse an individual article page
    def parse_model(self, response):

        title = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/h1/text()').extract_first()
        content = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/article//text()').extract()
        content = ''.join(content)
        # the article header sometimes carries an extra leading span,
        # so the positions of the author and publish-time spans differ
        span = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span')
        if len(span) == 2:
            author = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[1]/text()').extract_first()
            time = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[2]/text()').extract_first()
        else:
            author = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[2]/text()').extract_first()
            time = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[3]/text()').extract_first()

        # hand the item over to the pipelines
        item = ToutiaoproItem()
        item['title'] = title
        item['content'] = content
        item['time'] = time
        item['author'] = author

        yield item

    # parse the refreshed search-result list and append the newly loaded links
    def artical_list(self, new_response):
        # one div per search result
        div_list = new_response.xpath('/html/body/div/div[4]/div[2]/div[3]/div/div/div')
        # only entries beyond the ones already collected are new
        num_urls = len(self.urls)
        num_div = len(div_list)
        for i in range(num_urls, num_div):
            href_temp = div_list[i].xpath('./div/div/div/div/div//@href').extract_first()
            href_temp = 'https://www.toutiao.com/a' + href_temp.split('/', 3)[2]
            self.urls.append(href_temp)
            print("!!!!!!!!!!!!!")
            print(href_temp)
            print("!!!!!!!!!!!!")

Downloader middleware (middlewares.py)

You can add your own proxies to the PROXY_https list; the ones left in below are not usable. Once you have added working proxies, uncomment the lines in process_request and in process_exception to turn the proxy pool on.

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
from time import sleep
import random

class ToutiaoproDownloaderMiddleware:

    # proxy pool -- replace these entries with working proxies of your own
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',

    ]
    def process_request(self, request, spider):

        # ip = random.choice(self.PROXY_https)
        # request.meta['proxy'] = 'https://' + ip
        return None

    def process_response(self, request, response, spider):
        bro1 = spider.bro1  # browser for the search-result list
        bro2 = spider.bro2  # browser for individual articles
        # the request is for one of the collected article links
        if request.url in spider.urls:
            bro2.get(request.url)
            sleep(2)
            page_text = bro2.page_source
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            bro1.get(request.url)
            sleep(2)
            page_text = bro1.page_source

            response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)

            return response

    def process_exception(self, request, exception, spider):
        # on a download error, retry the request through another proxy (uncomment to enable)
        # ip = random.choice(self.PROXY_https)
        # request.meta['proxy'] = 'https://' + ip
        #
        # return request
        pass

Pipelines (pipelines.py)

I store the data in MySQL; just fill in your own MySQL connection details in the code. Note that you need a database named python and a toutiao table with the columns title, content, time and author, where content is a TEXT column and the rest are VARCHAR.
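
If you would rather create the database and the table from Python instead of the MySQL client, a minimal sketch with pymysql could look like the following (the host, user, password and VARCHAR sizes are placeholders of my own):

import pymysql

# sketch: create the python database and the toutiao table described above
conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='your_password', charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS python DEFAULT CHARACTER SET utf8')
cursor.execute('USE python')
cursor.execute('CREATE TABLE IF NOT EXISTS toutiao ('
               'title VARCHAR(255), content TEXT, time VARCHAR(64), author VARCHAR(64))')
conn.commit()
cursor.close()
conn.close()

The pipeline code itself (pipelines.py):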

import pymysql

class ToutiaoproPipeline:
    def process_item(self, item, spider):
        title = item['title']
        #print(title)
        return item

class mysqlPipeLine(object):
    conn = None
    cursor = None
    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='your MySQL host', port=3306, user='your MySQL user', password='your MySQL password', db='python', charset='utf8')
    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            title = item['title']
            print(title)
            # parameterised query: lets the driver handle quoting instead of building the SQL string by hand
            self.cursor.execute('insert into toutiao (title,content,time,author) values(%s,%s,%s,%s)',
                                (item["title"], item["content"], item["time"], item["author"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item
    def close_spider(self,spider):
        self.cursor.close()
        self.conn.close()
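
Neither the downloader middleware nor the MySQL pipeline does anything until it is enabled in settings.py. That file is not shown in the post, but assuming the default project layout Scrapy generates for toutiaopro, the relevant settings would look roughly like this:

# settings.py -- a sketch; the class paths assume the default toutiaopro project layout
ROBOTSTXT_OBEY = False   # usually needed for sites that disallow crawlers in robots.txt
DOWNLOADER_MIDDLEWARES = {
    'toutiaopro.middlewares.ToutiaoproDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'toutiaopro.pipelines.ToutiaoproPipeline': 300,
    'toutiaopro.pipelines.mysqlPipeLine': 301,
}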

That is about it. The project is fairly basic and good for practice, and the same template can be extended to crawl Toutiao comments and other content.
The project is available at https://github.com/github-plus/toutiao
