scrapy 配合selenium使用的注意问题
今天使用scrapy 的中间件对request 进行封装!测试开始是一个网页,完美通过,然后准备大量爬!结果出来bug ,先上代码,或者其他大佬有更好的解决方法!!!!!有的请留言 让我学习一下! 关于这个scrapy框架 以后会上博客写使用详情 !
##小项目是这样写:
import scrapy
from ..items import YDoubanItem
from lxml import etree
class DoubanSpider(scrapy.Spider):
    """Spider that walks 119 pages of Douban book search results for the
    query 'python' and yields one YDoubanItem per book.

    The page HTML is rendered by the Selenium downloader middleware, so
    the JavaScript-built content is already present in ``response.body``.
    """
    name = 'douban'
    allowed_domains = ['book.douban.com']
    # Each result page shows 15 books; ``start=`` is the item offset.
    base_url = "https://book.douban.com/subject_search?search_text=python&cat=1001&start=%d"
    start_urls = []
    for i in range(119):
        start_urls.append(base_url % (i * 15))
    n = 1  # sequence number for the debug HTML dumps

    def parse(self, response):
        """Parse one search-result page and yield YDoubanItem objects.

        Also dumps the raw rendered HTML to ``doubanN.html`` so failed
        pages can be inspected offline.
        """
        contents = response.body.decode("utf-8")
        with open("douban%d.html" % self.n, "w", encoding="utf-8") as fp:
            fp.write(contents)
        # Bump the counter immediately after writing: the original
        # incremented it only after all items were yielded, so two
        # concurrently parsed pages could reuse the same file name.
        self.n += 1

        html_tree = etree.HTML(contents)
        books_tree = html_tree.xpath(
            '//div[@id="root"]//div[@class="sc-dnqmqq eXEXeG"]')
        for book in books_tree:
            img_url = book.xpath('./div//img[@class="cover"]/@src')
            title = book.xpath('./div//a[@class="title-text"]/text()')
            pingfen = book.xpath('./div//span[@class="rating_nums"]/text()')
            pingjia = book.xpath('./div//span[@class="pl"]/text()')
            zuozhe = book.xpath('./div//div[@class="meta abstract"]/text()')
            # Each list holds 0 or 1 values for this book, so zip()
            # silently skips books missing any field (e.g. unrated ones).
            for a, b, c, d, e in zip(img_url, title, pingfen, pingjia, zuozhe):
                item = YDoubanItem()
                item["img_url"] = a
                item['title'] = b
                item['pingfen'] = c
                item['pingjia'] = d
                item['zuozhe'] = e
                yield item
#接下来是中间件
from selenium import webdriver
from scrapy.conf import settings
from scrapy.http import HtmlResponse,Response
import time, random
from fake_useragent import UserAgent
class YDoubanDownloaderMiddleware(object):
    """Downloader middleware that renders each request with headless
    PhantomJS so JavaScript-built pages (the Douban search results)
    arrive fully populated.
    """

    def process_request(self, request, spider):
        """Fetch ``request.url`` with PhantomJS and short-circuit the
        download by returning a ready ``HtmlResponse``.

        On failure returns ``None`` so Scrapy's normal downloader (and
        its retry machinery) handles the request — the original returned
        ``False``, which is not a value Scrapy accepts from
        ``process_request``.
        """
        driver = None
        try:
            # One fresh browser per request; headless PhantomJS is used
            # because a visible Chrome window lost data after page one.
            driver = webdriver.PhantomJS(
                executable_path=r'D:\ysc桌面\Desktop\phantomjs-2.1.1-windows\bin\phantomjs.exe')
            driver.get(request.url)
            html = driver.page_source
        except Exception:
            # Log instead of the original bare-except print, then fall
            # through to the default downloader.
            spider.logger.exception("selenium render failed: %s", request.url)
            return None
        else:
            return HtmlResponse(url=request.url, body=html,
                                encoding="utf-8", request=request)
        finally:
            # The original leaked one PhantomJS process per request
            # because driver.quit() was commented out.
            if driver is not None:
                driver.quit()
# 自定义代理中间件
class ProxyMiddleware(object):
    """Attaches a random proxy and a random User-Agent to every request."""

    def __init__(self):
        # Build the UserAgent database once; the original constructed it
        # on every request, re-reading its cache file each time.
        self.ua = UserAgent()

    def process_request(self, request, spider):
        """Pick a random proxy from the PROXIES setting and a random UA.

        Always returns ``None`` so processing continues down the
        middleware chain — the original returned ``False`` on error,
        which is not a legal ``process_request`` return value.
        """
        try:
            # spider.settings replaces the removed ``scrapy.conf``
            # module-level settings import.
            proxies = spider.settings.get("PROXIES")
            proxy = random.choice(proxies)
            request.meta['proxy'] = proxy['host']
            # The UA must be set on the headers; the original wrote it
            # to request.meta['user-agent'], which Scrapy ignores.
            request.headers['User-Agent'] = self.ua.random
        except Exception:
            spider.logger.exception("proxy middleware failed")
        return None
#接下来管道
import pymongo
class YDoubanPipeline(object):
    """Persists scraped items into the local MongoDB ``test.douban``
    collection.
    """

    def __init__(self):
        self.client = pymongo.MongoClient("localhost")
        self.db = self.client["test"]
        self.table = self.db['douban']

    def process_item(self, item, spider):
        """Insert one item; return it so later pipelines still run."""
        # insert_one() replaces Collection.insert(), which is
        # deprecated/removed in modern PyMongo.
        self.table.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Release the MongoDB connection when the crawl finishes; the
        # original never closed the client.
        self.client.close()
出现的bug就是 中间件使用
#driver = webdriver.Chrome()
会导致 爬到第二页的时候就获取不到数据,发生异常,返回不到response,一直弄了很久都不行!如有大牛知道请告知!!!!
后来改用了
webdriver.PhantomJS(executable_path=r'D:\ysc桌面\Desktop\phantomjs-2.1.1-windows\bin\phantomjs.exe')
获取网页数据正常 写了119个网页数据没有问题!
所以需要用到 webdriver 配合使用 scrapy
尽量使用无界面浏览器 ,具体有界面浏览器的问题在研究着!
746

被折叠的 条评论
为什么被折叠?



