General (sequential) method
91.403 s/page
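The "general" method here is a plain sequential loop: each page is downloaded and parsed before the next request starts. A minimal sketch, assuming the same get_page(page) helper used in the multithreaded version below:

import time

def run_sequential(pages=6):
    t1 = time.time()
    for i in range(pages):
        get_page(i)  # blocks until this page is downloaded and parsed
    t2 = time.time()
    print("Elapsed {}s".format(round(t2 - t1, 3)))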
Multithreading
How to use multithreading
import time
import threading

# get_page and the mysql helper module come from the project's own code
if __name__ == "__main__":
    conn, cur = mysql.mysql_conn()
    mysql_client = mysql.MysqlORM(conn, cur)
    t1 = time.time()  # record the start time
    moving = list()
    get_page(1)  # test a single page first
    for i in range(6):
        move = threading.Thread(target=get_page, args=(i,))
        move.start()
        moving.append(move)
    for m in moving:
        m.join()
    t2 = time.time()  # record the end time
    print("Elapsed {}s".format(round(t2 - t1, 3)))
Multithreading raises the request rate, so an IP proxy is needed: each request to the target site goes out through a short-lived proxy IP. Setting up the proxy:
import time
import requests

def get_proxy(website):
    while True:
        resp = requests.get(f'http://proxy.aigauss.com/proxy/next/{website}')
        try:
            j = resp.json()
            p = f'{j["ip"]}:{j["port"]}'
            print('using proxy %s' % p)
            return {
                'http': p,
                'https': p
            }
        except Exception:
            print(f'failed to obtain a proxy: {resp.text}')
            time.sleep(2)
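The returned dict plugs straight into the proxies argument of requests. A hypothetical get_page (the real one is not shown in these notes) might use it like this:

def get_page(page):
    # hypothetical sketch: fetch one search-result page through the proxy
    url = f'https://search.jd.com/Search?keyword=饮料&page={page}'
    resp = requests.get(url, proxies=get_proxy('jd'), timeout=10)
    # ... parse resp.text and write the rows via mysql_client ...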
30.154 s/page
Asynchronous (the Scrapy framework)
Key concepts
1. spider
The spider manages which data to request and parses the data out of each response; it is where the core business logic lives. The request itself is accepted by the engine and handed to the scheduler for queuing; the scheduler then returns it to the engine, which sends it out through the downloader middlewares.
2. middlewares
The middlewares take care of setting headers and proxies for each request, and of re-sending requests whose responses are bad. The key point is that they sit between sending a request and receiving its response (here this means the commonly used downloader middleware).
3. item
The item defines the data format, e.g. the field names that are written into the database.
4. pipeline
After the spider yields an item, the pipeline picks it up; this is where the data is written to a file or a database (a minimal sketch of this flow follows below).
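To make the four pieces concrete, here is a minimal, self-contained sketch (DemoItem/DemoSpider/DemoPipeline are illustrative names only; the project's real code follows later):

import scrapy

class DemoItem(scrapy.Item):
    # item: defines the fields to be stored
    title = scrapy.Field()

class DemoSpider(scrapy.Spider):
    # spider: decides what to request and how to parse responses
    name = 'demo'
    start_urls = ['https://example.com/']

    def parse(self, response):
        item = DemoItem()
        item['title'] = response.xpath('//title/text()').get()
        yield item  # the yielded item is routed to the enabled pipelines

class DemoPipeline:
    # pipeline: receives every yielded item; write it to a file/DB here
    def process_item(self, item, spider):
        print(dict(item))
        return item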
Scrapy project layout
scrapy.cfg is the project configuration file and must sit in the project root, next to the jd package (see the sketch below); it records configuration info, most importantly which module acts as the settings (settings.py), and can also carry database connection and deployment settings.
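For reference, a typical layout of this project (a sketch; any file not named in these notes follows the standard Scrapy defaults):

jd/                      # project root
├── scrapy.cfg           # project config; points at the settings module
└── jd/
    ├── __init__.py
    ├── items.py         # item definitions (see items.py below)
    ├── middlewares.py   # downloader middlewares (ProxyMiddleware below)
    ├── pipelines.py     # item pipelines (MysqlPipeline below)
    ├── settings.py      # the switches shown below
    └── spiders/
        └── jd_refer.py  # the spider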
settings.py sets the various switches of the Scrapy framework, as shown below:
BOT_NAME = 'jd'

SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'

LOG_LEVEL = 'INFO'  # log level: DEBUG / INFO / WARNING / ERROR
CONCURRENT_REQUESTS = 2  # at most 2 concurrent requests
RETRY_TIMES = 5  # maximum number of retries
DOWNLOAD_TIMEOUT = 10  # request timeout in seconds
DOWNLOADER_MIDDLEWARES = {  # enable the downloader middleware; the number is its order
    "jd.middlewares.ProxyMiddleware": 543,
}
ITEM_PIPELINES = {  # enable the pipeline that writes to the database
    'jd.pipelines.MysqlPipeline': 300,
}
DEFAULT_REQUEST_HEADERS = {
    'authority': 'p.3.cn',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'accept': '*/*',
    'referer': 'https://search.jd.com/',
    'accept-language': 'zh-CN,zh;q=0.9',
    # 'Cookie': 'token=0af978cad27e9de4ef38e1d557b933c9,1,909184'
}
ROBOTSTXT_OBEY = False
T1: 24.4 s/page
The first run successfully fetched 266 records; 32 products were dropped because they had been retried more than 3 times.
T2: 34.0 s/page
After raising RETRY_TIMES to 5, 261 records were fetched and only 2 were dropped.
middlewares.py
Intercept responses + attach the proxy
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import logging
import requests
import re
import time
from scrapy.exceptions import IgnoreRequest
from scrapy.downloadermiddlewares.retry import RetryMiddleware

logger = logging.getLogger(__name__)
# use this logger for logging here; inside a spider use self.logger


class ProxyMiddleware(RetryMiddleware):

    def get_proxy(self, website):
        """
        Fetch a proxy. The proxy service responds in the form:
        {
            ip: "119.114.96.77",
            ttl: 261,
            protocol: "http",
            port: 19396,
            time: "2021-03-05 14:53:57"
        }
        """
        while True:
            response = requests.get(url=f'http://proxy.aigauss.com/proxy/next/{website}')
            if response.status_code == 200:
                proxy = response.json()
                if proxy:
                    logger.debug('switching proxy: %s', proxy)
                    return "https://%(ip)s:%(port)s" % proxy
                else:
                    time.sleep(2)
                    logger.info('no proxy returned, waiting two seconds before retrying')
            else:
                raise ValueError('failed to obtain a proxy')

    def process_request(self, request, spider):
        request.meta['proxy'] = self.get_proxy('jd')

    @staticmethod
    def validation_requests(res):
        a = [
            r"{'code':200,'limit':1}",
            r"<script>window.location.href='https://passport.jd.com/new/login.aspx?"
        ]
        for b in a:
            if re.search(b, res):
                logger.warning(f'abnormal marker {b} found in response')
                return True

    def process_response(self, request, response, spider):
        if self.validation_requests(response.text):
            logger.warning(f"bad response, request: {request.url}, proxy: {request.meta['proxy']}")
            # retry: use the framework's built-in retry helper
            req = self._retry(request, "bad response", spider)
            if req is not None:
                return req
            else:
                raise IgnoreRequest
        return response
jd_refer.py
import json
import scrapy
import re
from scrapy import Request
from jd.items import JdItem
import time


class JDSpider(scrapy.Spider):
    page = 1
    count = 0
    name = 'jd_refer'
    start_urls = ['https://search.jd.com/Search?keyword=饮料&qrst=1&stock=1'
                  '&pvid=d9f1f93b10b84ccbb39ffc80192158e8&page=1&s=1&click=0']
    t1 = time.time()

    def turn_page(self):
        self.page = self.page + 2

    def parse(self, response):
        self.logger.info(f"processing page {self.page}: start")
        for li in response.xpath('//div[@id="J_goodsList"]//li'):
            sku = li.xpath('.//@data-sku').get()
            spu = li.xpath('.//@data-spu').get()
            if not spu:
                self.logger.warning(f"spu is null {sku}")
                spu = sku
            url = "https://cd.jd.com/description/channel?skuId={}&mainSkuId={}" \
                  "&charset=utf-8&cdn=2&callback=showdesc".format(sku, spu)
            # asynchronous from here: the next request is issued without waiting
            # for this one to finish; the response is handled later by extract_pic
            yield Request(url=url, callback=self.extract_pic, cb_kwargs={'sku': sku})
            # break
        self.logger.info(f"processing page {self.page}: done")
        self.turn_page()
        page = self.page
        if page <= 20:
            self.logger.info(f"turning to page {page}")
            nex = ('https://search.jd.com/Search?keyword=奶粉&qrst=1&stock=1'
                   '&pvid=d9f1f93b10b84ccbb39ffc80192158e8&s=1&page={}&click=0').format(page)
            yield Request(url=nex, callback=self.parse)

    def extract_pic(self, response, **kwargs):
        """
        Parse the product images.
        :param response:
        :param kwargs:
        :return:
        """
        sku = kwargs['sku']
        doc = response.text[9:-1]  # strip the showdesc(...) JSONP wrapper
        doc = json.loads(doc)
        try:
            res = scrapy.Selector(text=doc["content"])
        except Exception as e:
            self.logger.error(f"error {doc} {e.args}")
            return
        lst = []
        try:
            imgcon = res.xpath("//style/text()").get()
            image_row = re.compile(r'\.ssd-module-wrap \.(.*?)\{.*?\((.*?)\)')
            image_info = re.findall(image_row, imgcon)
            for image in image_info:
                item = {
                    'image_id': image[0],
                    'image_url': 'https:' + image[1]
                }
                lst.append(item['image_url'])
        except TypeError:
            # no <style> block in the description: fall back to the lazy-loaded <img> links
            for i in res.xpath("//img//@data-lazyload"):
                link = i.get()
                if link.startswith("http"):
                    link = link[5:]
                img_link = "https:" + link
                lst.append(img_link)
        except Exception as e:
            self.logger.error(f"error {response.text} {e.args}")
        url = "https://item-soa.jd.com/getWareBusiness?skuId={}".format(sku)
        self.logger.info(f'image links collected {lst}')
        yield Request(url=url, callback=self.extract_price, cb_kwargs={'sku': sku, 'pic': lst})

    def extract_price(self, response, **kwargs):
        dic = kwargs
        self.logger.info('fetching price info...')
        try:
            price = response.json()["price"]["p"]
            dic['price'] = price
            self.logger.info(f'got product price {price}')
        except Exception:
            dic['price'] = None  # keep the key so the next step does not KeyError
            self.logger.error(f'failed to get product price: {response.text}')
        url = "https://item.jd.com/{}.html".format(kwargs['sku'])
        yield Request(url=url, callback=self.extract_goods_desc, cb_kwargs=dic, dont_filter=True)

    def extract_goods_desc(self, response, **kwargs):
        res = response.text
        response = scrapy.Selector(text=res)
        dic = kwargs
        item = JdItem()
        # product name: several image nodes precede the name, so keep the last text node
        name = ''
        for i in response.xpath('//div[@class="sku-name"]//text()'):
            name = i.get().strip()
            self.logger.info(f'imginfo {i} {name}')
        item['name'] = name
        lst = []
        # product introduction
        count = 0
        for li in response.xpath('//div[@class="p-parameter"]//li'):
            if count == 0:
                stra = "品牌:" + li.xpath('./a//text()').get()
            else:
                stra = li.xpath('.//text()').get()
            print(f"stra data {stra}")
            lst.append(stra)
            count = count + 1
        intro = ';'.join(lst)
        size = {}
        for dl in response.xpath('//div[@class="Ptable-item"]/dl/dl'):
            keys = dl.xpath('.//dt//text()').get()
            values = dl.xpath('.//dd//text()').get()
            size[keys] = values
        item['intro'] = str(intro)
        item['size2pack'] = str(size)
        item['sku'] = dic['sku']
        item['price'] = dic['price']
        item['pic'] = str(dic['pic'])
        self.logger.info(f"got product description {item}")
        self.count = self.count + 1
        self.logger.info(f"record number {self.count}")
        yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class ListItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    sku = scrapy.Field()
    spu = scrapy.Field()


class JdItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    sku = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    intro = scrapy.Field()
    size2pack = scrapy.Field()
    pic = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
from pymysql.cursors import DictCursor
import logging

logger = logging.getLogger(__name__)


class MysqlPipeline(object):
    """
    Synchronous writes to MySQL.
    """

    def __init__(self):
        # open the connection; charset='utf8mb4' is required when storing Chinese text
        self.conn = pymysql.connect(
            host='localhost',
            user='root', passwd='123454321',
            database='practice',
            port=3306,
            charset='utf8mb4'
        )
        # create the cursor
        self.cur = self.conn.cursor(DictCursor)
        print("connection established")

    def insert_one(self, table: str, data: dict):
        name = ','.join(data.keys())
        print(name)
        col = ','.join('%({})s'.format(k) for k in data.keys())
        print('col', col)
        sql = f'insert ignore into {table}({name}) values({col})'
        self.cur.execute(sql, data)
        self.conn.commit()
        rowid = self.cur.lastrowid
        print(f'{table}: inserted one row {rowid}')
        return rowid

    def process_item(self, item, spider):
        logger.info(f'item type {type(dict(item))}')
        self.insert_one('jd', dict(item))
        return item  # return the item so later pipelines can still process it

    def close_spider(self, spider):
        # close the cursor and the connection
        self.cur.close()
        self.conn.close()
debug.py
from scrapy.cmdline import execute
execute('scrapy crawl jd_refer'.split())