Recently I had a requirement for facial expression recognition. It was fairly simple: detect smiles on pedestrians. Training a detector with YOLO yourself is the easy part; the hard part is where to find training data. Below is a record of crawling Baidu image search with Scrapy.
Environment: Ubuntu 20.04, conda, Python 3.10, Scrapy 2.7. Don't forget Pillow: if it isn't installed, no error is raised by default, the images simply never get saved.
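For reference, a fresh environment along these lines should work (the exact conda/pip invocations are my assumption; the versions are the ones listed above):

conda create -n scrapy_smile python=3.10
conda activate scrapy_smile
pip install scrapy Pillow fake-useragent   # I used Scrapy 2.7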
Once the environment is ready, create the project and the spider:

scrapy startproject scrapy_smile           # the project name is up to you
cd scrapy_smile
scrapy genspider baidu image.baidu.com     # create the spider
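genspider creates spiders/baidu.py. The spider below also imports SmilespiderItem from items.py; the original post doesn't show that file, but a minimal version (my assumption) only needs the two fields the spider fills in:

import scrapy


class SmilespiderItem(scrapy.Item):
    word = scrapy.Field()   # the URL-quoted search keyword
    link = scrapy.Field()   # one thumbnail URL

With the item defined, spiders/baidu.py looks like this: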
import re
from urllib import parse

import scrapy
from fake_useragent import UserAgent  # fetches its data over the network and often breaks; download the JSON file locally and point `path` at it
from ..items import SmilespiderItem


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['image.baidu.com']

    # note: these run at import time, as soon as Scrapy loads the spider module
    word_origin = input("Search keyword: ")
    max_down = input("Number of images to download: ")
    count_per_page = 30
    word = parse.quote(word_origin)
    max_count = int(max_down)
    # word = parse.quote('美女')  # hard-coded test values
    # max_count = 100
    url = "https://image.baidu.com/search/acjson?tn=resultjson_com&logid=11578100820615399278&ipn=rj&ct=201326592&is=&fp=result&fr=&word={}&cg=girl&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn={}&gsm=7800000000000078&1666847835228="

    def start_requests(self):
        ua = UserAgent(path=r"../fake_useragent.json")
        headers = {'User-Agent': str(ua.random)}
        # one request per page of results
        for pn in range(0, self.max_count, self.count_per_page):
            _url = self.url.format(self.word, self.word, pn, self.count_per_page)
            yield scrapy.Request(url=_url, callback=self.parse, headers=headers)

    def parse(self, response):
        # the response is JSON; grabbing the thumbnail URLs with a regex is simpler than parsing it
        regex = '"thumbURL":"(.*?)"'
        pattern = re.compile(regex, re.S)
        links = pattern.findall(response.text)
        # yield a fresh item per link; reusing one mutated item can mix up downloads
        for link in links:
            item = SmilespiderItem()
            item["word"] = self.word
            item["link"] = link
            yield item
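As noted in the import comment, fake_useragent pulls its browser database from the network and breaks regularly. If it keeps failing even with a local JSON file, a dependency-free fallback (my workaround, not from the original post) is to rotate a small hard-coded list:

import random

# a few real desktop User-Agent strings; extend as needed
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
]

headers = {'User-Agent': random.choice(USER_AGENTS)}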
This URL may need to be replaced fairly often: Baidu has anti-crawler measures, and when the crawl stops returning results, swapping in a fresh URL usually fixes it. To get one, open a Baidu image search with the browser dev tools' Network tab and copy the request URL of an acjson call. The four parameters that matter are word, queryWord, pn, and rn: the keyword (passed twice), the starting image index, and the number of images per page.
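When the crawl suddenly yields nothing, it also helps to sanity-check the endpoint outside Scrapy first. A minimal sketch with requests (the trimmed-down URL here is my shorthand; substitute the full template from the spider if it comes back empty):

import re
from urllib import parse

import requests

word = parse.quote("微笑")
# trimmed-down acjson URL; whether it returns data depends on Baidu's current anti-crawler rules
url = (f"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
       f"&word={word}&queryWord={word}&pn=0&rn=30")
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
links = re.findall(r'"thumbURL":"(.*?)"', resp.text)
print(len(links), links[:3])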
The file that actually downloads the images is the pipeline (pipelines.py). There isn't much to explain; the code speaks for itself.
import hashlib
import urllib

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.python import to_bytes


class SmilespiderPipeline(ImagesPipeline):
    word = ''
    count = 0

    def get_media_requests(self, item, info):
        # crude progress counter
        self.count += 1
        print(self.count)
        # the spider stored the keyword URL-quoted; unquote it for the folder name
        self.word = urllib.parse.unquote(item['word'])
        yield Request(item['link'])

    def file_path(self, request, response=None, info=None, *, item=None):
        # name each image after the SHA-1 of its URL, under a per-keyword folder
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return f'{self.word}/{image_guid}.jpg'
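One note on the pipeline: storing the keyword on self.word is shared mutable state, which is fine here only because each run crawls a single keyword. Since Scrapy 2.4, file_path also receives the item (hence the item=None parameter), so a variant that reads the keyword straight from the item would look like this:

    def file_path(self, request, response=None, info=None, *, item=None):
        # read the keyword from the item instead of shared pipeline state
        word = urllib.parse.unquote(item['word'])
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return f'{word}/{image_guid}.jpg'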
In settings.py, enable the pipeline and point IMAGES_STORE at the folder where images should be saved; LOG_LEVEL is optional, set it as needed. Note that the module path must match your actual project name (the post uses smileSpider):

ITEM_PIPELINES = {
    'smileSpider.pipelines.SmilespiderPipeline': 100,
}
LOG_LEVEL = 'WARNING'
IMAGES_STORE = './images'
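Since the goal is YOLO training data, Scrapy's built-in size filters can also weed out thumbnails too small to label (optional, my addition; these are standard ImagesPipeline settings, and the thresholds are just a guess):

# skip images smaller than 100x100 pixels
IMAGES_MIN_WIDTH = 100
IMAGES_MIN_HEIGHT = 100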
Finally, put a small launcher script in the project root (the filename is up to you):
from scrapy import cmdline
cmdline.execute('scrapy crawl baidu'.split(' '))
Give it a trial run: it prompts for the search keyword and the number of images to download, and the downloads end up under ./images/<keyword>/.