Review of the Previous Lesson
Common settings.py variables
# 1. Set the log level
LOG_LEVEL = ""   # e.g. "WARNING"
# 2. Save logs to a file (no terminal output)
LOG_FILE = "xxx.log"
# 3. Set the data export encoding (mainly for JSON files)
FEED_EXPORT_ENCODING = "utf8"
# 4. Storage path for non-structured data
IMAGES_STORE = "/home/tarena/so/images"
# Two ways to define the storage path on Windows
IMAGES_STORE = "D:\\so\\images"   # double backslash to avoid escape sequences
IMAGES_STORE = "D:/so/images"
# 5. Set the User-Agent
USER_AGENT = "Mozilla/5.0"
# 6. Set the maximum number of concurrent requests (default: 16)
CONCURRENT_REQUESTS = 32
# 7. Download delay (how long to wait between page requests, in seconds)
DOWNLOAD_DELAY = 0.5
# 8. Default request headers
DEFAULT_REQUEST_HEADERS = {
    "Cookie": "",
    "Referer": "",
    "User-Agent": "",
}
# 9. Register item pipelines
ITEM_PIPELINES = {
    # "project_dir.pipelines.ClassName": priority (1-1000; lower number = higher priority)
}
# 10. Register downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    # "project_dir.middlewares.ClassName": priority
}
# 11. Cookies: disabled by default
COOKIES_ENABLED = False | True
1. Both False and True effectively enable cookies; the difference is where they come from:
2. False: cookies are taken from DEFAULT_REQUEST_HEADERS in settings.py
3. True: cookies are taken from the cookies parameter of scrapy.FormRequest/scrapy.Request in the spider file
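Taken together, a minimal settings.py using the variables above might look like the following sketch (the project name "so" and the pipeline class name are placeholders, not from the original notes):

# settings.py - consolidated sketch of the common variables
BOT_NAME = "so"

LOG_LEVEL = "WARNING"              # only log warnings and above
FEED_EXPORT_ENCODING = "utf8"      # readable non-ASCII text in exported JSON
USER_AGENT = "Mozilla/5.0"
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 0.5               # be polite: pause between requests

DEFAULT_REQUEST_HEADERS = {
    "Referer": "https://www.example.com/",
}

ITEM_PIPELINES = {
    "so.pipelines.SoPipeline": 300,   # lower number runs first
}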
Scraping non-structured data (images)
1. spider
    # yield an item that carries the image URL (and the name used for the file path)
    yield item
2. pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy

class TestPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # pass the file name along via meta so file_path() can use it
        yield scrapy.Request(url=item['url'], meta={'item': item['name']})

    def file_path(self, request, response=None, info=None):
        name = request.meta['item']
        filename = name
        return filename
3. settings.py
IMAGES_STORE = 'D:\\Spider\\images'
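For completeness, a sketch of the item and spider that feed the pipeline above; the field names url and name match what TestPipeline reads, while the start URL and XPath are placeholders:

# items.py - fields consumed by TestPipeline (sketch)
import scrapy

class ImageItem(scrapy.Item):
    url = scrapy.Field()    # image URL to download
    name = scrapy.Field()   # file name returned by file_path()

# spider (sketch) - extract the image URL and a name, then yield the item
class SoSpider(scrapy.Spider):
    name = "so"
    start_urls = ["https://www.example.com/"]

    def parse(self, response):
        item = ImageItem()
        item["url"] = response.xpath('//img/@src').get()
        item["name"] = "first_image.jpg"
        yield item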
scrapy.Request()
# Parameters
1. url
2. callback
3. headers
4. meta: pass data between callbacks, define a proxy
5. dont_filter: whether to skip the duplicate/domain filter - default False (allowed_domains is checked)
   # skip the check: dont_filter=True
6. cookies
# Request attributes
1. request.url
2. request.headers
3. request.meta
4. request.method (default: GET)
# Response attributes
1. response.url
2. response.text
3. response.body
4. response.meta
5. response.encoding
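A short sketch showing how these pieces fit together: meta carries data from one callback to the next (URLs and XPath expressions are placeholders):

import scrapy

class DetailSpider(scrapy.Spider):
    name = "detail"
    start_urls = ["https://www.example.com/list"]

    def parse(self, response):
        for href in response.xpath('//a/@href').getall():
            # hand data to the next callback through meta
            yield scrapy.Request(
                url=response.urljoin(href),
                meta={"list_url": response.url},
                callback=self.parse_detail,
                dont_filter=False,   # keep the duplicate filter on
            )

    def parse_detail(self, response):
        print(response.meta["list_url"], "->", response.url)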
Configuring middlewares
Random User-Agent
# 1. middlewares.py - set the headers attribute
import random

ua_list = ["Mozilla/5.0 ..."]   # pool of User-Agent strings

class RandomUaDownloaderMiddleware(object):
    def process_request(self, request, spider):
        request.headers["User-Agent"] = random.choice(ua_list)

# 2. settings.py
DOWNLOADER_MIDDLEWARES = {'xxx.middlewares.xxx': 300}
Random proxy
# 1. middlewares.py - set the meta attribute
import random

proxy_list = ["http://1.1.1.1:8888"]   # pool of proxy addresses

class RandomProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        request.meta["proxy"] = random.choice(proxy_list)

    # keep resending the request (with a new proxy) when it fails
    def process_exception(self, request, exception, spider):
        return request

# 2. settings.py
DOWNLOADER_MIDDLEWARES = {'xxx.middlewares.xxx': 200}
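A possible refinement, not from the original notes: keep the pools in settings.py (custom keys UA_POOL and PROXY_POOL, assumed here) and read them through from_crawler, so the middleware carries no hard-coded data:

# settings.py (assumed custom keys)
UA_POOL = ["Mozilla/5.0 (Windows NT 10.0)", "Mozilla/5.0 (X11; Linux x86_64)"]
PROXY_POOL = ["http://1.1.1.1:8888", "http://2.2.2.2:8888"]

# middlewares.py - pull the pools from the crawler settings
import random

class PooledUaProxyMiddleware(object):
    def __init__(self, ua_pool, proxy_pool):
        self.ua_pool = ua_pool
        self.proxy_pool = proxy_pool

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            crawler.settings.getlist("UA_POOL"),
            crawler.settings.getlist("PROXY_POOL"),
        )

    def process_request(self, request, spider):
        request.headers["User-Agent"] = random.choice(self.ua_pool)
        request.meta["proxy"] = random.choice(self.proxy_pool)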
Today's Notes
Distributed Crawlers
Introduction to distributed crawling
- Principle
    Multiple hosts share one crawl queue
- Implementation
    Replace Scrapy's scheduler (the scrapy_redis module)
- Why Redis
    1. Redis is memory-based, so it is fast
    2. Redis is a non-relational database; a Redis set stores the fingerprint of every request (for deduplication)
- Installing scrapy_redis
    sudo pip3 install scrapy_redis
scrapy_redis in detail
- GitHub
    https://github.com/rmax/scrapy-redis
- settings.py notes
# Replace the scheduler: use the Redis-backed scheduler to store the request queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Replace the dedup mechanism: make all spiders deduplicate through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Do not clear the Redis queue: enables pause/resume, checkpointed and incremental crawling (default: False)
SCHEDULER_PERSIST = True
# Priority queue (default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Other available queues
# FIFO queue
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# LIFO queue
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
# Redis pipeline
ITEM_PIPELINES = {
    # actually writes the items into the Redis database
    'scrapy_redis.pipelines.RedisPipeline': 300
}
# Host and port used when connecting to Redis
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
Rewriting the Tencent Recruitment spider as a distributed crawler
1. Scrape the data normally first (non-distributed)
2. Rewrite as distributed (storing into Redis at the same time)
    1. settings.py
# 1. Use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 2. Use the scrapy_redis dedup mechanism
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 3. Whether to keep request fingerprints - True: keep, False: clear (default)
SCHEDULER_PERSIST = True
# 4. (Optional) add the Redis pipeline to ITEM_PIPELINES
'scrapy_redis.pipelines.RedisPipeline': 200
# 5. Define the Redis host and port
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
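To verify the rewrite worked, connect with redis-cli after a short run. The key names below are scrapy_redis defaults (<spider_name>:requests, <spider_name>:dupefilter, <spider_name>:items), shown here for a spider named tencent:

redis-cli
> KEYS tencent:*
1) "tencent:dupefilter"   # set of request fingerprints
2) "tencent:requests"     # the shared request queue
3) "tencent:items"        # scraped items (if RedisPipeline is enabled)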
Rewriting as distributed (storing into MySQL at the same time)
- Modify the pipelines (a sketch of the MySQL pipeline follows this list)
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 200,
    'Tencent.pipelines.TencentMysqlPipeline': 200,
}
- Clear the Redis database
    flushdb
- Copy the code to the other machines in the cluster; run it on two or more machines at the same time
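The notes reference Tencent.pipelines.TencentMysqlPipeline but do not show it; a minimal sketch with pymysql might look like this (database, table, and field names are assumptions):

# pipelines.py - sketch of the MySQL pipeline referenced above
import pymysql

class TencentMysqlPipeline(object):
    def open_spider(self, spider):
        # connection parameters are placeholders
        self.db = pymysql.connect(host='127.0.0.1', user='root',
                                  password='123456', database='tencentdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        ins = 'insert into tencent_jobs(name, address) values(%s, %s)'
        self.cursor.execute(ins, [item['name'], item['address']])
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()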
Tencent Recruitment distributed rewrite - Method 2
- Rewrite using redis_key
# Step 1: settings.py needs no changes - identical to the distributed settings above
# Step 2: tencent.py
from scrapy_redis.spiders import RedisSpider

class TencentSpider(RedisSpider):
    # 1. remove start_urls
    # 2. define redis_key
    redis_key = 'tencent:spider'

    def parse(self, response):
        pass

# Step 3: copy the code to every crawl server and start the project
# Step 4: in the redis command line, LPUSH the first URL to crawl
> LPUSH tencent:spider <URL of page 1>
# The project cannot exit on its own once the crawl finishes - how to stop it?
# settings.py
CLOSESPIDER_TIMEOUT = 3600   # automatically stop and exit after the given time (3600 s)
Scrapy - POST requests
- Method + parameters
scrapy.FormRequest(
    url=posturl,
    formdata=formdata,
    callback=self.parse
)
- Youdao Translate case study
1. Create the project + spider file
# -*- coding: utf-8 -*-
import json
import random
import time
from hashlib import md5

import scrapy
from ..items import YoudaoItem


class YoudaoSpider(scrapy.Spider):
    name = 'youdao'
    allowed_domains = ['fanyi.youdao.com']
    # start_urls = ['http://fanyi.youdao.com/']
    word = input("Enter the word to translate: ")

    # override start_requests
    def start_requests(self):
        """
        1. url parameter: the POST URL
        2. formdata parameter: the form data as a dict
        3. hand the request to the scheduler's queue
        """
        post_url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
        ts, salt, sign = self.get_ts_salt_sign(self.word)
        formdata = {
            "i": self.word,
            "from": "AUTO",
            "to": "AUTO",
            "smartresult": "dict",
            "client": "fanyideskweb",
            "salt": salt,
            "sign": sign,
            # "ts": ts,
            # "bv": "65313ac0ff6808a532a1d4971304070e",
            "doctype": "json",
            "version": "2.1",
            "keyfrom": "fanyi.web",
            "action": "FY_BY_REALTlME",
        }
        # cookies = self.get_cookies()
        yield scrapy.FormRequest(
            url=post_url,
            formdata=formdata,
            callback=self.parse,
            # cookies=cookies,  # handled in the middleware instead
        )

    # def get_cookies(self):
    #     # parse the raw Cookie header string into a dict
    #     cs = "P_INFO=xinwei_user; OUTFOX_SEARCH_USER_ID=1087533395@10.169.0.83; JSESSIONID=aaay3E4LlPWDr64vjDn3w; OUTFOX_SEARCH_USER_ID_NCOO=1784198643.0122821; ___rl__test__cookies=1571131720098"
    #     cs_list = cs.split("; ")
    #     cs_dict = {}
    #     for c in cs_list:
    #         cs_dict[c.split("=")[0]] = c.split("=")[1]
    #     return cs_dict

    def get_ts_salt_sign(self, word):
        # ts/salt/sign are computed the same way as the site's own JS
        ts = str(random.randint(0, 9))
        salt = ts + str(int(time.time() * 1000))
        string = "fanyideskweb" + word + salt + "n%A-rKaT5fb[Gy?;N5@Tj"
        s = md5()
        s.update(string.encode())
        sign = s.hexdigest()
        return ts, salt, sign

    def parse(self, response):
        # 1. extract the translation result into item["result"]
        item = YoudaoItem()
        html = json.loads(response.text)
        item['result'] = html["translateResult"][0][0]["tgt"]
        yield item
2. items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class YoudaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    result = scrapy.Field()
3. middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals


class YoudaoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class YoudaoDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# attach cookies to the request object
class YoudaoCookieMiddleware(object):
    def process_request(self, request, spider):
        request.cookies = self.get_cookies()

    def get_cookies(self):
        # parse the raw Cookie header string into a dict ("Cookie: " prefix removed)
        cs = "P_INFO=xinwei_user; OUTFOX_SEARCH_USER_ID=1087533395@10.169.0.83; JSESSIONID=aaay3E4LlPWDr64vjDn3w; OUTFOX_SEARCH_USER_ID_NCOO=1784198643.0122821; ___rl__test__cookies=1571131720098"
        cs_list = cs.split("; ")
        cs_dict = {}
        for c in cs_list:
            cs_dict[c.split("=")[0]] = c.split("=")[1]
        return cs_dict
4. settings.py
# Cookies: disabled by default
COOKIES_ENABLED = False | True
1. Both False and True effectively enable cookies; the difference is where they come from:
2. False: cookies are taken from DEFAULT_REQUEST_HEADERS in settings.py
3. True: cookies are taken from the cookies parameter of scrapy.FormRequest in the spider file
Three ways to add cookies in Scrapy
# 1. Via the settings.py file
1. COOKIES_ENABLED = False   # left commented/False: cookies come from the Cookie value in DEFAULT_REQUEST_HEADERS
2. DEFAULT_REQUEST_HEADERS = {}   # add the Cookie here
# 2. In the spider file - via the cookies parameter (cookies processed into a dict)
COOKIES_ENABLED = True   # enable cookies; use the cookies parameter of Request()
def start_requests(self):
    yield scrapy.Request(url=url, cookies={}, callback=xxx)
e.g.:
cookies = self.get_cookies()
yield scrapy.FormRequest(
    url=post_url,
    formdata=formdata,
    callback=self.parse,
    cookies=cookies,
)

def get_cookies(self):
    # parse the raw Cookie header string into a dict ("Cookie: " prefix removed)
    cs = "P_INFO=xinwei_user; OUTFOX_SEARCH_USER_ID=1087533395@10.169.0.83; JSESSIONID=aaay3E4LlPWDr64vjDn3w; OUTFOX_SEARCH_USER_ID_NCOO=1784198643.0122821; ___rl__test__cookies=1571131720098"
    cs_list = cs.split("; ")
    cs_dict = {}
    for c in cs_list:
        cs_dict[c.split("=")[0]] = c.split("=")[1]
    return cs_dict
# 3. Via a downloader middleware
COOKIES_ENABLED = True   # enable cookies; the middleware sets the request's cookies attribute
def process_request(self, request, spider):
    request.cookies = {}
e.g.:
# attach cookies to the request object
class YoudaoCookieMiddleware(object):
    def process_request(self, request, spider):
        request.cookies = self.get_cookies()

    def get_cookies(self):
        # parse the raw Cookie header string into a dict ("Cookie: " prefix removed)
        cs = "P_INFO=xinwei_user; OUTFOX_SEARCH_USER_ID=1087533395@10.169.0.83; JSESSIONID=aaay3E4LlPWDr64vjDn3w; OUTFOX_SEARCH_USER_ID_NCOO=1784198643.0122821; ___rl__test__cookies=1571131720098"
        cs_list = cs.split("; ")
        cs_dict = {}
        for c in cs_list:
            cs_dict[c.split("=")[0]] = c.split("=")[1]
        return cs_dict
Machine vision and tesseract
Purpose
    Handling graphical CAPTCHAs
Three key concepts
- OCR
    # Definition
    OCR: Optical Character Recognition
    # Principle
    Convert the text of receipts, newspapers, books, manuscripts and other printed matter into image data via optical input such as scanning, then use text-recognition technology to turn the image data into electronic text
- tesseract-ocr
    The underlying OCR recognition engine (not a Python module; it cannot be imported)
    # An open-source OCR engine maintained by Google
- pytesseract
    A Python module that calls the underlying engine
    # A thin Python API wrapper around tesseract-ocr
Installing tesseract-ocr
- Ubuntu
    sudo apt-get install tesseract-ocr
- Windows
    1. Download the installer
    2. Add it to the Path environment variable
- Test
    # terminal | cmd
    tesseract xxx.jpg output_filename
Installing pytesseract
- Install
    sudo pip3 install pytesseract
    python3 -m pip install pytesseract
    # Offline install
    1. Download the package from the official site - xxx.tar.gz
    2. Unpack: tar -zxvf xxx.tar.gz
    3. cd into the unpacked folder and find README and setup.py
    4. sudo python3 setup.py install
- Usage
    import pytesseract
    # third-party image-processing library (Pillow)
    from PIL import Image

    img = Image.open("xxx.jpg")
    code = pytesseract.image_to_string(img)
- Approach for CAPTCHA-protected sites (see the sketch after this list)
    1. Fetch the CAPTCHA image
    2. Open the image with PIL
    3. Recognize the CAPTCHA in the image with pytesseract and convert it to a string
    4. Send the string to the CAPTCHA input box or to some URL
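A minimal sketch of these four steps; the CAPTCHA URL, login URL, and the form field name "code" are placeholders:

# sketch: fetch a CAPTCHA, OCR it, submit it
import requests
import pytesseract
from PIL import Image

# 1. fetch the CAPTCHA image
captcha_url = "https://www.example.com/captcha.jpg"
with open("captcha.jpg", "wb") as f:
    f.write(requests.get(captcha_url).content)

# 2. open the image with PIL
img = Image.open("captcha.jpg")

# 3. recognize the text
code = pytesseract.image_to_string(img).strip()

# 4. send it along with the form request
requests.post("https://www.example.com/login", data={"code": code})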
Online CAPTCHA-solving platforms
- Why use an online platform
    tesseract-ocr's recognition rate is low; distorted text and noise often make the CAPTCHA unrecognizable
- Steps for using the Yundama (cloud captcha) platform
    1. Download and read the API documentation
    2. Adapt the API sample code and test it against your program
    3. Integrate it for real: recognize online and use the returned result
- Cracking the Yundama site's own CAPTCHA
    1. Download and adapt the API sample, wrap it in a function, and get the recognition result
import http.client, mimetypes, urllib, json, time, requests

######################################################################

class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text

######################################################################

def get_result(filename):
    # username
    username = 'yibeizi001'
    # password
    password = 'zhanshen002'
    # software ID, a required developer parameter; get it from "My Software" in the developer console
    appid = 1
    # software key, a required developer parameter; get it from "My Software" in the developer console
    appkey = '22cc5376925e9387a23cf797cb9ba745'
    # image file
    # filename = 'getimage.jpg'
    # CAPTCHA type, e.g. 1004 means 4 alphanumeric characters; pricing differs by type.
    # Fill it in accurately or the recognition rate suffers. All types: http://www.yundama.com/price.html
    codetype = 3000
    # timeout, in seconds
    timeout = 60
    # initialize
    yundama = YDMHttp(username, password, appid, appkey)
    # log in to Yundama
    uid = yundama.login()
    # check the balance
    balance = yundama.balance()
    # start recognition: image path, CAPTCHA type ID, timeout (s); returns the result
    cid, result = yundama.decode(filename, codetype, timeout)
    return result

######################################################################
2. Visit the Yundama site, grab the CAPTCHA and recognize it online
from selenium import webdriver
from PIL import Image
from day11.Verification_code.ydmapi import *


class YdmSpider(object):
    def __init__(self):
        self.url = 'http://www.yundama.com/'
        self.browser = webdriver.Chrome()

    # 1. take a screenshot of the home page: index.png
    def get_index(self):
        self.browser.get(self.url)
        self.browser.save_screenshot('index.png')

    # 2. crop the CAPTCHA out of the screenshot: cache.png
    def get_cachejpg(self):
        # 1. locate the CAPTCHA node (x, y coordinates)
        location = self.browser.find_element_by_xpath('//*[@id="verifyImg"]').location
        # 2. its size (width and height)
        size = self.browser.find_element_by_xpath('//*[@id="verifyImg"]').size
        # top-left x, y coordinates
        left = location['x']
        top = location['y']
        # bottom-right x, y coordinates
        right = left + size['width']
        bottom = top + size['height']
        # 3. crop the CAPTCHA image - crop((x, y, x, y))
        img = Image.open('index.png').crop((left, top, right, bottom))
        img.save('cache.png')

    # recognize online via Yundama
    def get_cache(self):
        result = get_result('cache.png')
        return result

    # entry point
    def run(self):
        self.get_index()
        self.get_cachejpg()
        result = self.get_cache()
        print(result)


if __name__ == '__main__':
    spider = YdmSpider()
    spider.run()
The Fiddler packet-capture tool
- Configuring Fiddler
    # Trust the certificate
    1. Tools - Options - HTTPS
       Tick "Decrypt HTTPS Traffic"; a dialog pops up, confirm all the way through
    # Capture only browser traffic
    2. "...from browsers only"
    # Set the listening port (default: 8888)
    3. Tools - Options - Connections
    # Restart Fiddler after configuring (important)
    4. Close Fiddler, then open it again
- Configuring the browser proxy
    1. Install the Proxy SwitchyOmega extension
    2. Browser top-right: SwitchyOmega -> Options -> New profile -> AID1901 (name) -> Create
       Enter: HTTP:// 127.0.0.1 8888
       Click: Apply changes
    3. Click SwitchyOmega in the top-right to switch proxies
- Common Fiddler menus
    1. Inspector: view the details of a packet
       Split into a request part and a response part
    2. Common tabs
       Headers: request headers
       WebForms: POST form data: <body>
                 GET query parameters: <QueryString>
       Raw: show the whole request as plain text
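Once Fiddler is listening on 127.0.0.1:8888, a script can be routed through it as well, which makes it easy to compare the crawler's requests with the browser's (a sketch; verify=False is needed because Fiddler re-signs HTTPS traffic with its own certificate):

# route a requests session through Fiddler to inspect the traffic it sends
import requests

proxies = {
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}
# verify=False: skip checking Fiddler's self-signed certificate
res = requests.get("https://www.example.com/", proxies=proxies, verify=False)
print(res.status_code)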
Scraping data from mobile apps
Method 1 - phone + Fiddler
    See the folder "移动端抓包配置" for the setup
Method 2 - the browser's F12 developer tools
    Case study: cracking the mobile version of Youdao Translate
Crawling Summary
# 1. What is a crawler
    A crawler is an automated program that requests websites and extracts data
# 2. What is the robots protocol
    The crawler (robots) protocol: through it a website tells search engines which pages may be crawled and which may not
# 3. Basic crawling workflow
    1. Send a request and get the response
    2. Parse
    3. Save the data
# 4. Requesting
    1. urllib
    2. requests
    3. scrapy
# 5. Parsing
    1. re regular expressions
    2. lxml + xpath
    3. json module
# 6. selenium + browser
# 7. Common anti-crawling strategies
    1. Headers: the most basic measure; User-Agent and Referer are watched most often - consider copying real values from the browser
    2. UA: build a User-Agent pool and switch randomly on every request
    3. Blocking high-frequency IPs:
       for large volumes, use a proxy IP pool to pose as many visitors, and/or throttle the crawl speed
    4. Cookies:
       build a pool of valid cookies and switch randomly on every request
    5. CAPTCHAs:
       if CAPTCHAs are rare they can be filled in manually;
       graphical CAPTCHAs can be recognized with tesseract;
       otherwise fall back to online solving services, manual solving, or training a machine-learning model
    6. Dynamically generated content:
       data generated by JS is usually fetched with a GET request to a specific address, typically returning JSON
    7. Signatures and JS encryption:
       usually encrypted locally in JS - find and analyze the local JS file, or execute the JS with the execjs module
    8. JS that rearranges the page structure
    9. JS that redirects the response to a new address
# 8. How the Scrapy framework works
# 9. The principle of distributed crawling
    Multiple hosts share one crawl queue
The BeautifulSoup parsing module
- Definition
    An HTML/XML parsing module; it relies on an underlying parser such as lxml
- Install
    sudo pip3 install beautifulsoup4
- Usage flow
    from bs4 import BeautifulSoup
    # 1. create the parse object (requires a third-party parser)
    soup = BeautifulSoup(html, "lxml")
    # 2. call the find_all() method
    r_list = soup.find_all(node, conditions)
- Parsers supported by BeautifulSoup
    - lxml: fast, strong document fault tolerance
    - html.parser: average on both counts
    - xml: fast, strong document fault tolerance
- Common methods
    a. find(): returns 1 node
    b. find_all(): returns a list
    c. node.get_text(): text content
- Example:
    r_list = soup.find_all("div", attrs={"id": "nav"})
- Sample code
    from bs4 import BeautifulSoup as bs

    html = """
    <div class="test">卡卡</div>
    <div class="test">啦啦</div>
    """
    soup = bs(html, "lxml")
    r_list = soup.find_all("div", attrs={"class": "test"})
    print(r_list)
    # extract the data
    for r in r_list:
        print(r.get_text())
链家二手房
import requests, time, random from bs4 import BeautifulSoup from fake_useragent import UserAgent class LianJiaSpider: def __init__(self): self.url = "https://zz.lianjia.com/ershoufang/pg{}/" self.blag=1 # 随机headers def get_headers(self): agent = UserAgent().random headers = {"User-Agent": agent} return headers # 请求 def get_html(self, url): if self.blag<=3: try: res = requests.get(url=url, headers=self.get_headers(),timeout=5) # html=res.text html = res.content.decode() return html except Exception as e: print(e) self.blag+=1 self.get_html(url) # 解析 def parse_html(self, url): html = self.get_html(url) if not html: return None soup=BeautifulSoup(html,"lxml") l_list=soup.find_all("li",attrs={"class":"clear LOGVIEWDATA LOGCLICKDATA"}) for l in l_list: item={} adr=l.find("div",attrs={"class":"positionInfo"}).get_text() item["name"]=adr.split("-")[0].strip()+"——"+adr.split("-")[1].strip() info_list=l.find("div",attrs={"class":"houseInfo"}).get_text().split("|") if len(info_list)==7: # print(info_list) item["model"] = info_list[0].strip() item["area"] = info_list[1].strip() item["direction"] = info_list[2].strip() item["perfect"] = info_list[3].strip() item["floor"]=info_list[4].strip() item["tower"]=info_list[-1].strip() else: item["model"] = item["area"] = item["direction"] = item["perfect"] = item["tower"]=None item["total"] =l.find("div",attrs={"class":"totalPrice"}).get_text() item["unit"]=l.find("div",attrs={"class":"unitPrice"}).get_text() print(item) def run(self): for i in range(1,20): url=self.url.format(i) self.parse_html(url) # time.sleep(random.randint(1,3)) #没抓取一页要初始化self.blag self.blag=1 if __name__ == '__main__': l = LianJiaSpider() l.run()