1、爬取经历
爬取豆瓣电影排行信息是一个经典的爬取案例。我开始用scrapy框架的时候一切OK,就没有设置下载的延时,也没有更换请求头和使用代理IP,结果等我想设置的时候为时已晚,IP已经被豆瓣封禁了,真是郁闷到家了。
上面是报错信息,这是服务器使用了访问频次检查:如果一个IP在短时间内访问服务器过于频繁,且cookies相同,则会被判定为机器人,你会被要求登录后再访问服务器或者输入验证码,甚至直接封禁你的IP。后面通过中间件技术把登录后的cookie复制过来总算解决了。用普通的requests请求也是可以的,所以我后面会再用requests模块实现一次。
2、案例简介
上面只是我爬取的经历,但是并不影响代码的运行,小伙伴们直接复制我这边scrapy的代码是完全能用的。这个案例主要是给大家介绍下中间件的使用和详情页的信息获取。
3、scrapy中间件
Spider中间件(Spider Middleware):处理Spider的输入输出(request和response),可以对Spider的请求或者响应进行加工,例如给请求加上cookie、修改请求头、使用IP代理,嵌入selenium模块等,我们一般使用下载器中间件里面的类或者自己写。
3.1 使用随机的请求头
有的网站发现同一个请求头多次发出请求,就会进行限制。我们通过随机函数来实现,首先将请求头列表放进settings文件。
# Pool of User-Agent strings for random rotation (used by the RandomUserAgent
# middleware).  BUG FIX: the original list was missing the comma after the
# first entry, so Python concatenated the first two literals into one bogus
# User-Agent and the list silently had 7 elements instead of 8.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML like Gecko) Chrome/20.0.1132.57 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML like Gecko) Chrome/20.0.1092.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML like Gecko) Chrome/20.0.1090.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML like Gecko) Chrome/19.77.34.5 Safari/537.1',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML like Gecko) Chrome/19.0.1084.9 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML like Gecko) Chrome/19.0.1063.0 Safari/536.3',
]
在middleware文件里面进行随机选择,如果请求头太多,可以放进py文件,那就在中间件里面读取外部文件到列表,然后随机选择。
3.2 使用IP代理
有的网站对客户端有更高的限制,就是看IP,上面已经说过了,所以要更换请求的IP,网上有免费的可以找些试试,不一定能行,也可以用收费的IP代理,是要花钱的,它生成一些随机的IP,有相关的Python语言代码模块,直接拿来就可以了 。
# Pool of proxy endpoints for the RandomProxy middleware.  Each entry is a
# dict with an "ip_port" key; a "user_passwd" key may be added for proxies
# that require Basic authentication (see RandomProxy in middlewares.py).
# NOTE(review): these free proxies expire quickly — refresh before running.
PROXY_LIST=[
{"ip_port":"122.239.152.93:19649"},
{"ip_port":"114.220.38.60:16529"},
{"ip_port":"175.155.189.104:15542"},
{"ip_port":"1.198.72.188:18790"},
{"ip_port":"110.83.146.60:19907"},
{"ip_port":"125.117.192.225:19179"},
]
# --- Scrapy settings for the Douban project ---
ROBOTSTXT_OBEY = False          # douban.com's robots.txt would forbid the crawl
#CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 2              # throttle requests to avoid the anti-bot IP ban

# BUG FIX: the original mixed two project names ("doubanmovie.middlewares"
# and "Douban.middlewares") although every import in this file uses the
# "Douban" package, and it assigned the same priority (543) to two different
# middlewares.  Paths are unified and priorities made distinct (lower runs
# closer to the engine).
DOWNLOADER_MIDDLEWARES = {
    "Douban.middlewares.RandomProxy": 400,
    "Douban.middlewares.RandomUserAgent": 543,
    "Douban.middlewares.DoubanDownloaderMiddleware": 544,
}
3.3 由于被封ip,只能通过登录后复制本机的cookie来解决,不要忘了要用本机的user-agent。这时可以把IP的代理给关闭。
下面是middleware文件的内容
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
#from .myextend import pro
# from Douban.settings import USER_AGENT_LIST
from Douban.settings import PROXY_LIST
def user_agent(ual_filename):
    """Load User-Agent strings from *ual_filename*, one per line.

    Only the text before the first comma of each line is kept.
    NOTE(review): real UA strings can themselves contain commas
    ("KHTML, like Gecko"), which this would truncate — confirm the
    file really is comma-separated.
    """
    with open(ual_filename, encoding='utf-8') as src:
        return [raw.strip('\n').split(',')[0] for raw in src]
def get_cookies_dict():
    """Parse the hard-coded logged-in browser cookie string into a dict.

    The string was copied from a logged-in Douban session; splitting on
    '; ' yields "key=value" chunks, and only the first '=' separates the
    key from the value (values may contain '=' themselves).
    """
    cookies_str = 'll="118194"; bid=NAAxHVpQdHg; _pk_id.100001.4cf6=e0f43914dae24c65.1689117206.; __yadk_uid=AfPjqpxLQe29ciBk29CEYr5DNFcvDNii; _vwo_uuid_v2=D8F73DD4876185DC0437D2847108D2D2B|d1abf23696346fa31bd756f2ac7c83b0; dbcl2="272094971:I8efVP2LGEE"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.27209; ck=LqJW; __utmc=30149280; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1689486612%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DeeWpDtlQCs9OZKvKoBAosZVK23ukLV09Jnl7x_-yNZp-V-pSez48xWaRc_Y4coec%26wd%3D%26eqid%3Dc2e7c5df000f36230000000564b3850e%22%5D; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.2070209389.1689117202.1689472604.1689486612.15; __utmz=30149280.1689486612.15.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.1307584475.1689117206.1689472604.1689486612.13; __utmb=223695111.0.10.1689486612; __utmz=223695111.1689486612.13.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1689486614; Hm_lpvt_16a14f3002af32bf3a75dfe352478639=1689486614; frodotk_db="d2966fef77b687a177c34fce672396e3"; __utmt=1; __utmb=30149280.2.10.1689486612; __gads=ID=0086f844a64ce4fa-2260f22253e20020:T=1689117214:RT=1689488333:S=ALNI_MagDq0vLEgXuI8For-64RRsgipDJg; __gpi=UID=00000c1fedf587dc:T=1689117214:RT=1689488333:S=ALNI_MYDeYvYR1JeMdFpevIqAeQAQyqtTA'
    pairs = (chunk.split('=', maxsplit=1) for chunk in cookies_str.split('; '))
    return {name: value for name, value in pairs}
# --- Disabled middlewares (kept for reference; re-enable via settings) ---
# RandomUserAgent: attach a random User-Agent (read from an external file)
# to every outgoing request.
# class RandomUserAgent(object):
# def process_request(self,request,spider):
# #print(request.headers['user_agent'])
# AGENT_LIST = user_agent(useragnetfile)
# ub =random.choice(AGENT_LIST)
# #ua =random.choice(USER_AGENT_LIST)
# request.headers['user_agent'] = ub

# Cookies copied from a logged-in browser session; injected into every
# request by DoubanDownloaderMiddleware below.
cookies = get_cookies_dict()

# RandomProxy: route each request through a random proxy from PROXY_LIST,
# adding a Proxy-Authorization header when credentials are present.
# class RandomProxy(object):
# def process_request(self, request, spider):
# proxy = random.choice(PROXY_LIST)
# print(proxy)
# if 'user_passwd' in proxy:
# # base64-encode the "user:password" credentials
# b64_up = base64.b64encode(proxy['user_passwd'].encode())
# # set the Proxy-Authorization header
# request.headers['Proxy-Authorization'] = 'Basic ' + b64_up.decode()
# # set the proxy endpoint
# request.meta['proxy'] = proxy['ip_port']
# else:
# # set the proxy endpoint
# request.meta['proxy'] = proxy['ip_port']
class DoubanDownloaderMiddleware:
    """Downloader middleware that attaches the logged-in cookies to every
    outgoing request — the workaround for Douban's anti-bot IP ban."""

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy entry point: build the middleware and hook the
        # spider_opened signal for logging.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Inject the module-level cookie dict (copied from a logged-in
        # browser session).  Returning None lets the request continue
        # through the rest of the middleware chain.
        request.cookies = cookies
        return None

    def process_response(self, request, response, spider):
        # Responses pass through untouched.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares / the default handler
        # deal with download errors.
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
4、详情页的数据获取
要获取详情页的数据,比如豆瓣电影的故事简介和影评,是在每一个电影链接的详情页,所以就要通过start_url获得每个详情页的url,再次发出请求,送给其他的函数进行解析提取相关的数据,这里面要传递一个字典参数过去,这样就可以把详情页提取的数据增加到原来的数据字典里面。
不过这个地方有个坑,item = DoubanItem()一定要放到for循环里面,如果在外面创建这个对象的话,获取的首页的数据无法传递到item对象。
import scrapy
from Douban.items import DoubanItem
class MovieSpider(scrapy.Spider):
    """Crawl the Douban Top-250 list, following each movie's detail page
    for the synopsis and hot short comments."""

    name = "movie"
    allowed_domains = ["douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        """Extract the list-page fields, then request every detail page."""
        node_list = response.xpath('//*[@class="info"]')
        for node in node_list:
            # A fresh item per movie is required: sharing one instance
            # across the loop would make every request's meta point at the
            # same (last-written) object.
            item = DoubanItem()
            item['name'] = node.xpath('./div[1]/a/span[1]/text()').extract_first()
            item['info'] = str(node.xpath('./div[2]/p[1]/text()[1]').extract_first()).strip().replace('\xa0', '')
            item['category'] = str(node.xpath('./div[2]/p[1]/text()[2]').extract_first()).strip().replace('\xa0', '')
            item['score'] = node.xpath('./div[2]/div/span[2]/text()').extract_first()
            item['desc'] = node.xpath('./div[2]/p[2]/span/text()').extract_first()
            item['link'] = node.xpath('./div[1]/a/@href').extract_first()
            detail_url = response.urljoin(item['link'])
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'item': item},
            )
        # Pagination: recurse into the next page while one exists.
        next_url = response.xpath('//*[@id="content"]/div/div[1]/div[2]/span[3]/a/@href').extract_first()
        if next_url is not None:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Add synopsis ('story') and hot comments ('review') to the item.

        BUG FIX: the original ran str() over the extracted *list*, which
        left literal "[", "]", quote and comma artifacts in the saved text
        that the .replace() chain only partially removed.  Joining the text
        nodes yields clean strings directly.
        """
        item = response.meta['item']
        story_parts = response.xpath('//*[@id="link-report-intra"]/span[1]/span/text()').extract()
        review_parts = response.xpath('//*[@id="hot-comments"]//span[@class="short"]//text()').extract()
        item['story'] = ''.join(story_parts).replace('\u3000', '').replace('\n', '').strip()
        # Double quotes are stripped because they break the optional MySQL
        # insert (see the pipeline discussion in the accompanying article).
        item['review'] = ''.join(review_parts).replace('\u3000', '').replace('\n', '').replace('"', '').strip()
        yield item
这样就把数据全部获取到了,可以通过Mysql数据库保存数据,这个代码我就不写了,小伙伴可以参考 这篇博文看下就可以了。Scrapy爬虫框架案例学习之三(爬取网易招聘页面信息写入Mysql数据库)_u010152658的博客-CSDN博客
但是我这里面有2点避坑说明:一是建表的字段不要用desc这个关键字,python可以用但是mysql语句里面不行 ,1063报错;第二获取的影评里面的英文双引号要替换掉,这个会在插入数据时报错。有时不是太好找原因,只能把真实语句放到mysql里面跑,看看报什么错。
5、将数据写入excel文件
import openpyxl
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class DoubanPipeline:
    """Collect every scraped movie into an in-memory Excel sheet and save
    it as 电影数据.xlsx when the spider closes."""

    def __init__(self):
        # One workbook per crawl; the active sheet holds all rows.
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.title = 'Top250'
        # Header row: movie name, info, score, description, category,
        # synopsis, short reviews.
        self.ws.append(('电影名','信息','评分','描述','分类','故事','短评'))

    def close_spider(self, spider):
        # Persist everything once the crawl is finished.
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        # Missing fields default to '' so a half-filled item still writes.
        row = tuple(
            item.get(field, '')
            for field in ('name', 'info', 'score', 'desc', 'category', 'story', 'review')
        )
        self.ws.append(row)
        return item
6、requests模块实现豆瓣电影top250的数据获取
这个实属不得已,硬着头皮写的,不然自己给自己弄一鼻子灰无法收场。条条大路通罗马,不管黑猫白猫,能抓到老鼠就是好猫,就一个文件。请求头和IP的文件放到同级目录下即可,要导入pymysql。
import random
import requests
#from mysql_function import *
import lxml.etree as etree
import time,pymysql
from scrapy import Selector
def sql_select(sql):
    """Run a query (or DDL) against the local ``mydb`` database and return
    all fetched rows as a tuple of tuples.

    BUG FIX: the cursor and connection are now closed in ``finally`` blocks
    so they are no longer leaked when ``execute`` raises.
    """
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         password='123456', database='mydb', charset='utf8')
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            # cursor.description would give the column headers if needed
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        db.close()
# 添加一条数据数据
def insertdata(sql):
    """Execute a single INSERT/DDL statement against ``mydb`` and commit.

    BUG FIX: cleanup now happens in ``finally`` blocks so the cursor and
    connection are not leaked when ``execute``/``commit`` raises.
    """
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                         password='123456', database='mydb', charset='utf8')
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            # commit so the row is durable before the connection closes
            db.commit()
        finally:
            cursor.close()
    finally:
        db.close()
def readFile(path):
    """Return the file's lines with trailing whitespace/newlines stripped."""
    with open(path, 'r') as src:
        return [line.rstrip() for line in src]
def getHeaders():
    """Build a request-headers dict with a random User-Agent taken from
    user_agent.txt (one UA per line, in the working directory)."""
    chosen_ua = random.choice(readFile("user_agent.txt"))
    headers = {'User-Agent': chosen_ua}
    print(headers)
    return headers
def getIp():
    """Pick a random "host:port" proxy line from ip.txt and return it."""
    chosen = random.choice(readFile('ip.txt'))
    print(chosen)
    return chosen
def checkip(targeturl, ip):
    """Return True iff *targeturl* answers HTTP 200 through proxy *ip*.

    BUG FIX: the original bare ``except:`` also swallowed
    KeyboardInterrupt/SystemExit; only network-level errors raised by
    requests are now treated as "proxy dead".
    """
    headers = getHeaders()  # random UA for the probe request
    proxies = {"http": "http://" + ip, "https": "https://" + ip}
    try:
        status = requests.get(url=targeturl, proxies=proxies,
                              headers=headers, timeout=5).status_code
    except requests.RequestException:
        return False
    return status == 200
def getProxies(url):
    """Return a requests-style proxies dict for a verified-working proxy,
    or None when the randomly chosen proxy fails the probe."""
    candidate = getIp()
    if not checkip(url, candidate):
        return None
    proxies = {'http': 'http://' + candidate}
    print(proxies)
    return proxies
def get_index_url(url):
    """Scrape one Top-250 list page, persist every movie to MySQL, then
    recurse into the next page until pagination runs out."""
    proxies = getProxies(url)
    headers = getHeaders()
    page = requests.get(url, headers=headers, proxies=proxies)
    sel = Selector(page)
    for node in sel.xpath('//*[@class="info"]'):
        item = {}
        item['moviename'] = node.xpath('./div[1]/a/span[1]/text()').extract_first()
        item['info'] = str(node.xpath('./div[2]/p[1]/text()[1]').extract_first()).strip().replace('\xa0', '')
        item['category'] = str(node.xpath('./div[2]/p[1]/text()[2]').extract_first()).strip().replace('\xa0', '')
        item['score'] = node.xpath('./div[2]/div/span[2]/text()').extract_first()
        item['desc'] = node.xpath('./div[2]/p[2]/span/text()').extract_first()
        item['link'] = node.xpath('./div[1]/a/@href').extract_first()
        # enrich with synopsis + reviews fetched from the detail page
        item_con = get_detail_url(item['link'])
        item['story'] = item_con['story']
        item['review'] = item_con['review']
        # NOTE(review): values are interpolated straight into the SQL text;
        # scraped text containing double quotes can still break or inject
        # the statement — a parameterized execute would be safer.
        insert_emp_sql = f'''insert IGNORE into movietoptable (moviename,info,category,score,descripe,story,review)
values ("{item['moviename']}","{item['info']}","{item['category']}","{item['score']}","{item['desc']}","{item['story']}","{item['review']}")'''
        insertdata(insert_emp_sql)
        print('已成功向数据库插入一条记录')
        time.sleep(1)  # stay polite between detail-page fetches
    next_url_part = sel.xpath('//*[@id="content"]/div/div[1]/div[2]/span[3]/a/@href').extract_first()
    print(next_url_part)
    if next_url_part is not None:
        next_url = 'https://movie.douban.com/top250' + next_url_part
        print(next_url)
        get_index_url(next_url)
def get_detail_url(url):
    """Fetch one movie detail page and return {'story': …, 'review': …}.

    Proxy/UA rotation is deliberately skipped here; a fixed desktop Chrome
    User-Agent was enough for the detail pages.
    """
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    sel = Selector(requests.get(url, headers=header))
    story_raw = str(sel.xpath('//*[@id="link-report-intra"]/span[1]/span/text()').extract())
    review_raw = str(sel.xpath('//*[@id="hot-comments"]//span[@class="short"]//text()').extract())
    # strip the list-repr artifacts and whitespace escapes before storing
    item = {}
    item['story'] = story_raw.strip().replace("'", '').replace('\\u3000', '').replace('\\n', '').replace(' ', '').replace('\\r', '').replace('"', '')
    item['review'] = review_raw.strip().replace('\n', '').replace("'", '').replace('\\n', '').replace(' ', '').replace('\\r', '').replace('"', '')
    return item
if __name__ == '__main__':
    # Ensure the target table exists, then start crawling from page 1.
    # The DDL is idempotent (IF NOT EXISTS); the description column is
    # named `descripe` because `desc` is a reserved word in MySQL.
    start_url = 'https://movie.douban.com/top250'
    table_ddl = ''' CREATE TABLE IF NOT EXISTS `movietoptable`(
`moviename` varchar(255) Not NULL,
`info` varchar(255) default NULL,
`category` varchar(255) default NULL,
`score` varchar(255) default NULL,
`descripe` varchar(255) default NULL,
`story` varchar(500) default NULL,
`review` varchar(1000) default NULL,
PRIMARY KEY(moviename)
)ENGINE=INNODB DEFAULT CHARSET='utf8';
'''
    sql_select(table_ddl)
    get_index_url(start_url)
最后结果上图:
这里面有几部电影还是值得一看的,我就不给大家介绍了,descripe的简短的人生哲理让人回味,“不要跟我比惨,我比你更惨”,任何时候都不要自暴自弃,活着就是这样,咬咬牙就坚持下来了。
github下载地址GitHub - wangluixn/scrapy_doubantop250
本案例仅供学习使用,亲测可用。