文章目录
验证码处理
验证码示例有:http://example.python-scraping.com/user/register
这个验证码可以看到是一个二进制数据,用base64编码过的
pillow加载web中的二进制验证码
我们想办法用requests拿到这个数据并逆处理让pil显示出来
#!/usr/bin/env python
# encoding: utf-8
from io import BytesIO
from lxml.html import fromstring
from PIL import Image
import base64
import requests
def get_b64_string(html):
    """Return the base64 payload of the captcha image embedded in *html*.

    The captcha is inlined as a ``data:`` URI in the ``src`` attribute of
    the <img> inside ``div#recaptcha``; everything after the first comma
    is the base64-encoded image data.
    """
    doc = fromstring(html)
    src = doc.cssselect('div#recaptcha img')[0].get('src')
    _, _, payload = src.partition(',')
    return payload
def get_captcha_img(html):
    """Decode the captcha embedded in *html* and return it as a PIL Image.

    Reuses get_b64_string() to pull the base64 payload out of the page
    (instead of duplicating the same lxml/partition logic), then decodes
    it and wraps the raw bytes in BytesIO so PIL can open them.
    """
    binary_img_data = base64.b64decode(get_b64_string(html))
    return Image.open(BytesIO(binary_img_data))
def img_to_bw(img):
    """Binarize *img*: grayscale it, then map every pixel that is not
    pure black (value 0) to white.  Returns a 1-bit PIL image."""
    return img.convert('L').point(lambda px: 255 if px >= 1 else 0, '1')
if __name__ == "__main__":
    url = "http://example.python-scraping.com/user/register"
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    }
    response = requests.get(url, headers=headers)
    b64_string = get_b64_string(response.text)
    print(b64_string)  # the raw base64 data
    img = get_captcha_img(response.text)
    print(img)
    img.show()  # display the original captcha
    img_to_bw(img).show()  # display the binarized result
可以看到原始的图片
还有经过二值化后的图片
使用pytesseract处理简单验证码
安装tesseract,下载链接:https://tesseract-ocr.github.io/tessdoc/Downloads,在Windows中只需要保证存在tesseract.exe就行,无论你是哪个版本
遇到问题可以参考大概是这个:
Python tesseract is not installed or it’s not in your path 错误解决方案
安装对应的包
pip install pytesseract
修改一下上面的代码,让其识别二值化后的验证码
#!/usr/bin/env python
# encoding: utf-8
from io import BytesIO
from lxml.html import fromstring
from PIL import Image
import base64
import requests
import pytesseract
import string
def get_b64_string(html):
    """Extract the base64 payload of the captcha <img> in div#recaptcha.

    The image is inlined as a data: URI; the text after the first comma
    is the base64-encoded image bytes.
    """
    img_src = fromstring(html).cssselect('div#recaptcha img')[0].get('src')
    return img_src.partition(',')[-1]
def get_captcha_img(html):
    """Decode the captcha embedded in *html* and return it as a PIL Image.

    Delegates the HTML parsing to get_b64_string() rather than repeating
    the same selector/partition logic, then decodes the base64 payload
    and opens the raw bytes through BytesIO.
    """
    binary_img_data = base64.b64decode(get_b64_string(html))
    return Image.open(BytesIO(binary_img_data))
def img_to_bw(img):
    """Convert *img* to a 1-bit black/white image.

    Grayscale first, then threshold: only pixels that are pure black
    (value 0) stay black; everything else becomes white.
    """
    gray = img.convert('L')
    return gray.point(lambda px: 255 if px >= 1 else 0, '1')
def ocr(img):
    """Run OCR on the captcha image and return the recognized text.

    The image is binarized first, then handed to tesseract.  The raw OCR
    output is lowercased and stripped of everything that is not an ASCII
    letter (tesseract tends to emit stray whitespace/punctuation).
    """
    bw = img_to_bw(img)
    captcha = pytesseract.image_to_string(bw)
    lowered = captcha.lower()
    cleaned = ''.join(c for c in lowered if c in string.ascii_lowercase)
    if len(cleaned) != len(lowered):
        # Diff against the lowercased text so letters that were merely
        # case-folded (and kept) are not falsely reported as removed.
        print('removed bad characters: {}'.format(set(lowered) - set(cleaned)))
    return cleaned
if __name__ == "__main__":
    url = "http://example.python-scraping.com/user/register"
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    }
    response = requests.get(url, headers=headers)
    b64_string = get_b64_string(response.text)
    print(b64_string)  # the raw base64 data
    img = get_captcha_img(response.text)
    print(img)
    img.show()  # display the original captcha
    img_to_bw(img).show()  # display the binarized result
    print(ocr(img))  # print the recognized text
识别的效果如下
scrapy
常用的命令如下
- startproject:创建一个新项目
- genspider:根据模板生成一个新爬虫
- crawl:执行爬虫
- shell:启动交互式抓取控制台
第一个项目
我们来使用scrapy生成一个爬虫爬取example.python-scraping.com中的国家
新建一个项目
scrapy startproject example
创建爬虫
下面用模板生成一个爬虫
scrapy genspider country_or_district example.python-scraping.com --template=crawl
在目录中出现一个文件country_or_district.py,里面就是我们的代码
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class CountryOrDistrictSpider(CrawlSpider):
    """Crawl spider as generated by ``scrapy genspider --template=crawl``."""

    name = 'country_or_district'  # unique string identifying this spider
    # Domains the spider is allowed to crawl; if unset, any domain may be crawled.
    allowed_domains = ['example.python-scraping.com']
    start_urls = ['http://example.python-scraping.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Template placeholder callback: populate and return an item dict,
        e.g. via response.xpath('//div[@id="name"]').extract()."""
        item = {}
        return item
把这个文件放到example中的spiders文件夹中
优化设置
如果想了解更多细节,可以参考:https://blog.csdn.net/qq_42184699/article/details/92575404
scrapy默认对同一域名允许最多16个并发下载,并且两次下载之间没有延迟,下面需要做设置,修改settings.py,打开注释并修改,或者直接填入
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 3
修改其他配置
items.py下,定义我们需要得到的字段
class CountryOrDistrictItem(scrapy.Item):
    # Fields scraped for each country/district page.
    name = scrapy.Field()  # country or district name
    population = scrapy.Field()  # population figure shown on the page
再来修改一个爬虫类country_or_district.py,拿到我们的item
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import CountryOrDistrictItem
class CountryOrDistrictSpider(CrawlSpider):
    """Crawl example.python-scraping.com and scrape country/district items."""

    name = 'country_or_district'
    allowed_domains = ['example.python-scraping.com']
    start_urls = ['http://example.python-scraping.com/']

    # Follow index pages; scrape view pages; never enter /user/ pages.
    rules = (
        Rule(LinkExtractor(allow=r'/index/', deny=r'/user/'),
             follow=True),
        Rule(LinkExtractor(allow=r'/view/', deny=r'/user/'),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract the name (via CSS) and population (via XPath) fields."""
        item = CountryOrDistrictItem()
        item['name'] = response.css(
            'tr#places_country_or_district__row td.w2p_fw::text'
        ).extract()
        item['population'] = response.xpath(
            '//tr[@id="places_population__row"]/td[@class="w2p_fw"]/text()'
        ).extract()
        return item
测试爬虫
按上述修改完成后,启动爬虫
scrapy crawl country_or_district -s LOG_LEVEL=ERROR
输出如下
不同类型的爬虫
scrapy中的爬虫有如下几个级别
- spider:普通的抓取爬虫,通常只用于抓取一个类型的页面
- crawlspider:爬取爬虫。通常用于遍历域名,能够自动发现页面中的其他链接
- xmlfeedspider:遍历xml流并从每个节点中获取内容
- csvfeedspider:同上,只是换成了csv
- sitemapspider:先解析站点地图,使用不同的规则爬取网站
使用shell命令抓取
scrapy给我们提供了shell命令,方便调试xpath,命令行输入
scrapy shell http://example.python-scraping.com/places/default/view/Antigua-and-Barbuda-10
可以看到调试页面
尝试以下调试,均能正常使用
response.url
name_css='#places_country_or_district__row > td.w2p_fw::text'
response.css(name_css).extract()
检查结果
想要检查结果,可以输出到文件中
scrapy crawl country_or_district --output=./data/output.json -s LOG_LEVEL=INFO
输出后的output.json文件内容如下
中断与恢复
在抓取网站时,暂停功能很重要,比如要重启计算机,都有可能导致中断爬虫
中断需要把中间文件保存下来,用JOBDIR指定
scrapy crawl country_or_district -s LOG_LEVEL=DEBUG -s JOBDIR=./data/country_or_district
点击ctrl+c后,可以看到中断信息
data文件夹的内容如下
运行同样的命令,可以恢复继续爬取
scrapy crawl country_or_district -s LOG_LEVEL=DEBUG -s JOBDIR=./data/country_or_district