Scraping Toutiao (今日头条) with the Scrapy framework + Selenium automation
This is my first blog post, so the structure may be a bit messy. It is also my first hands-on project using the Scrapy framework for web scraping.
Spider code
Two things are worth noting. First, set path to the location of your own Chrome driver (webdriver.Chrome(executable_path=...) assumes Selenium 3; Selenium 4 deprecated executable_path in favor of a Service object). Second, the condition self.num == 5 in the if statement is the scroll threshold for the browser that crawls the article list: it scrolls for more results after every five articles. If you hit a list index out of range error, lower the threshold; the right value depends on how many results the keyword returns, so tune it case by case. Driver installation is covered in https://blog.csdn.net/n123456uo/article/details/91412740. Below is toutiao.py.
import scrapy
from selenium import webdriver
from toutiaopro.items import ToutiaoproItem
from time import sleep
from scrapy.http import HtmlResponse

class ToutiaoSpider(scrapy.Spider):
    name = 'toutiao'
    data = input("Enter the keyword to crawl: ")
    number = int(input("Enter the number of articles to crawl: "))  # caps how many articles are crawled
    address = 'https://www.toutiao.com/search/?keyword=' + data
    start_urls = [address]
    urls = []
    num = 0    # counts articles consumed since the last scroll of the list browser
    index = 0  # index of the next collected link to request

    # initialize the two browsers
    def __init__(self):
        path = r'H:\PythonCode\Spider\scrapy\wangyi\wangyi\spiders\chromedriver.exe'
        self.bro1 = webdriver.Chrome(executable_path=path)  # article-list browser
        self.bro2 = webdriver.Chrome(executable_path=path)  # article-detail browser

    # parse the search-result list for the keyword
    def parse(self, response):
        # grab the result containers from the list page
        div_list = response.xpath('/html/body/div/div[4]/div[2]/div[3]/div/div/div')
        for div in div_list:
            url_temp = div.xpath('./div/div/div/div/div//@href').extract_first()
            # rebuild the absolute article URL from the relative href
            url = 'https://www.toutiao.com/a' + url_temp.split('/', 3)[2]
            self.urls.append(url)
            print("--------")
            print(url)
            print("---------")
        # cap the number of articles crawled
        while self.index <= self.number:
            index = self.index
            yield scrapy.Request(self.urls[index], callback=self.parse_model)
            # after five articles, scroll the list page to load more links;
            # for keywords with sparse results, lower this threshold
            if self.num == 5:
                # scroll the list browser to the bottom
                self.bro1.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                sleep(5)
                page_text = self.bro1.page_source
                print("in if branch")
                new_response = HtmlResponse(url=self.bro1.current_url, body=page_text, encoding='utf-8')
                self.artical_list(new_response)
                self.num = 0
                self.index = self.index + 1
                print("index in if branch:", self.index)
            else:
                print("in else branch:", self.num)
                self.index = self.index + 1
                print("index in else branch:", self.index)
                self.num = self.num + 1

    # parse a single article page
    def parse_model(self, response):
        title = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/h1/text()').extract_first()
        content = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/article//text()').extract()
        content = ''.join(content)
        # the author/time spans shift position depending on how many spans the header has
        span = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span')
        if len(span) == 2:
            author = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[1]/text()').extract_first()
            time = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[2]/text()').extract_first()
        else:
            author = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[2]/text()').extract_first()
            time = response.xpath('//*[@id="root"]/div/div[2]/div[1]/div[2]/div[1]/span[3]/text()').extract_first()
        # hand the item to the pipelines
        item = ToutiaoproItem()
        item['title'] = title
        item['content'] = content
        item['time'] = time
        item['author'] = author
        yield item

    # parse the refreshed list page and collect only the newly loaded links
    def artical_list(self, new_response):
        div_list = new_response.xpath('/html/body/div/div[4]/div[2]/div[3]/div/div/div')
        num_urls = len(self.urls)
        num_div = len(div_list)
        for i in range(num_urls, num_div):
            href_temp = div_list[i].xpath('./div/div/div/div/div//@href').extract_first()
            href_temp = 'https://www.toutiao.com/a' + href_temp.split('/', 3)[2]
            self.urls.append(href_temp)
            print("!!!!!!!!!!!!!")
            print(href_temp)
            print("!!!!!!!!!!!!")
Middleware (middlewares.py)
You can add your own proxies to the PROXY_https list; the ones I wrote below are dead. After adding proxies, uncomment the marked lines in process_request and process_exception to enable the proxy pool.
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
from time import sleep
import random

class ToutiaoproDownloaderMiddleware:
    # proxy pool (replace with working proxies of your own)
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    def process_request(self, request, spider):
        # uncomment to route each request through a random proxy
        # ip = random.choice(self.PROXY_https)
        # request.meta['proxy'] = 'https://' + ip
        return None

    def process_response(self, request, response, spider):
        bro1 = spider.bro1  # article-list browser
        bro2 = spider.bro2  # article-detail browser
        # article URLs collected by the spider are rendered in the detail browser
        if request.url in spider.urls:
            bro2.get(request.url)
            sleep(2)
            page_text = bro2.page_source
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            # anything else (the search list page) goes through the list browser
            bro1.get(request.url)
            sleep(2)
            page_text = bro1.page_source
            response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return response

    def process_exception(self, request, exception, spider):
        # uncomment to retry a failed request with a fresh proxy
        # ip = random.choice(self.PROXY_https)
        # request.meta['proxy'] = 'https://' + ip
        # return request
        pass
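Neither the middleware above nor the pipelines below will do anything unless they are enabled in settings.py. A sketch of the relevant entries, assuming the standard Scrapy project layout (the priority numbers are conventional choices of mine, not taken from the post):

# settings.py -- sketch; priorities are conventional values, not from the original post
ROBOTSTXT_OBEY = False   # Toutiao's robots.txt would otherwise block the crawl
LOG_LEVEL = 'ERROR'      # keep the console output readable
DOWNLOADER_MIDDLEWARES = {
    'toutiaopro.middlewares.ToutiaoproDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'toutiaopro.pipelines.ToutiaoproPipeline': 300,
    'toutiaopro.pipelines.mysqlPipeLine': 301,
}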
Pipelines (pipelines.py)
I store the data in MySQL; just fill in your own MySQL connection details in the code. Note that you need a database named python containing a toutiao table with the fields title, content, time, and author; content is a TEXT column and the rest are VARCHAR.
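If you prefer to script that setup, here is a one-off sketch using pymysql; the credentials are placeholders and the column lengths are my assumption (only the names and the TEXT/VARCHAR split come from the description above):

# one-off setup script: create the "python" database and the "toutiao" table
# credentials are placeholders; column lengths are assumptions
import pymysql

conn = pymysql.Connect(host='your MySQL host', port=3306, user='your MySQL user',
                       password='your MySQL password', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS python DEFAULT CHARSET utf8')
cur.execute('USE python')
cur.execute('CREATE TABLE IF NOT EXISTS toutiao ('
            'title VARCHAR(255), content TEXT, time VARCHAR(64), author VARCHAR(64))')
conn.commit()
conn.close()

The pipeline code itself: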
import pymysql

class ToutiaoproPipeline:
    def process_item(self, item, spider):
        # pass-through pipeline; mysqlPipeLine below does the real work
        return item

class mysqlPipeLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # fill in your own MySQL host, user and password
        self.conn = pymysql.Connect(host='your MySQL host', port=3306, user='your MySQL user',
                                    password='your MySQL password', db='python', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            print(item['title'])
            # parameterized query, so quotes in the article text cannot break the SQL
            self.cursor.execute(
                'insert into toutiao (title, content, time, author) values (%s, %s, %s, %s)',
                (item['title'], item['content'], item['time'], item['author']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
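To run the project, execute scrapy crawl toutiao from the project root; the input() calls on the spider class will then prompt for the keyword and the number of articles before the crawl starts.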
That's about it. The project is fairly basic and good for practice, and the same template can be extended to crawl Toutiao comments and other content.
The project is available at https://github.com/github-plus/toutiao.