最近看知乎有的人虐狗总会贴上自己的女(百)票(度)的照骗,正好今天趁着38妇女节,抽时间来一波你的女票颜值到底多少分
先去知乎找几个类似“有个漂亮女朋友是什么体验###”的话题,挑些评论多的;找知乎的评论接口很简单,就不展开说了。
setting.py
# Scrapy settings: skip robots.txt and throttle requests — comment volume
# is small, so a delay alone is enough (no proxy pool needed).
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2  # seconds between requests

# Browser-like default headers so the Zhihu API answers normally.
# NOTE: the original carried both 'Cache-Control' and a lowercase duplicate
# 'cache-control' key; HTTP header names are case-insensitive, so only one
# is kept here.
DEFAULT_REQUEST_HEADERS = {
    'Host': "www.zhihu.com",
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:65.0) Gecko/20100101 Firefox/65.0",
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Language': "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    'Connection': "keep-alive",
    'Upgrade-Insecure-Requests': "1",
    'Pragma': "no-cache",
    'Cache-Control': "no-cache",
    'TE': "Trailers",
}

DOWNLOADER_MIDDLEWARES = {
    'Zhihu_beaute.middlewares.Zhihu_beauteUser_AgentMiddleware': 543,
}

ITEM_PIPELINES = {
    'Zhihu_beaute.pipelines.ZhihuBeautePipeline': 300,
}
因为评论不多,所以加点延迟就行,不用挂代理ip了。
item.py 中我只需要评论名字和图片url所以:
class ZhihuBeauteItem(scrapy.Item):
    """Item for one Zhihu answer: the author's name and the image URLs
    scraped from the answer body.
    """
    name = scrapy.Field()     # answer author's display name
    img_url = scrapy.Field()  # list of image URLs found in the answer HTML
    # (removed the dead trailing `pass` the original carried)
pipelines.py 就去保存图片就行
class ZhihuBeautePipeline(object):
    """Download every image URL carried by an item and save each one to
    disk as '<author-name>-<index>.jpg'.
    """

    # Target directory for downloaded images. The original hard-coded the
    # placeholder path inline; keep it in one place so it is easy to change.
    SAVE_DIR = 'xxxx/xx/xxx'

    def process_item(self, item, spider):
        """Fetch item['img_url'] entries with requests and write them to
        SAVE_DIR. Returns the item unchanged so later pipelines still run.
        """
        # Browser-like headers. 'Host' is deliberately omitted: requests
        # derives it from each URL. The original pinned 'pic4.zhimg.com'
        # even though relative URLs were prefixed with pic2.zhimg.com,
        # which sent a mismatched Host header.
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:65.0) Gecko/20100101 Firefox/65.0",
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'Accept-Language': "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            'Connection': "keep-alive",
            'Upgrade-Insecure-Requests': "1",
            'Pragma': "no-cache",
            'Cache-Control': "no-cache",
            'TE': "Trailers",
        }
        name = item['name']
        for i, url in enumerate(item['img_url'], start=1):
            # Some <img> src values are host-relative; original used
            # re.findall('http', url), which matched 'http' ANYWHERE in
            # the URL — prefix test is what was actually meant.
            if not url.startswith('http'):
                url = 'https://pic2.zhimg.com' + url
            print('!!!', url)
            resp = requests.get(url, headers=headers)
            time.sleep(0.5)  # be polite to the image CDN
            if resp.status_code != 200:
                # Skip failed downloads instead of writing an error page
                # to disk (the original saved whatever bytes came back).
                continue
            with open('{}/{}-{}.jpg'.format(self.SAVE_DIR, name, i), 'wb') as f:
                f.write(resp.content)
        return item
spider.py 很简单的逻辑
class ZhuhuBeauteSpider(scrapy.Spider):
    """Walk the Zhihu answers API for one question and yield one item per
    answer: the author's name plus the absolute image URLs in the body.
    """
    name = 'zhihu_beaute'
    allowed_domains = ['zhihu.com', 'pic3.zhimg.com']
    url = 'https://www.zhihu.com/api/v4/questions/285906324/answers?include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_labeled;data[*].mark_infos[*].url;data[*].author.follower_count,badge[*].topics&limit=5&offset={}&platform=desktop&sort_by=default'
    page = 0
    start_urls = [url.format(page)]

    def parse(self, response):
        """Parse one API page, yield an item per answer, then request the
        next offset (capped at 100 pages).
        """
        result = json.loads(response.body.decode())
        for answer in result['data']:
            soup = BeautifulSoup(answer['content'], 'lxml')
            # Keep only absolute image URLs. The original used
            # re.findall('^http', src), which raised TypeError whenever an
            # <img> had no src attribute (src is None) — guard for that.
            img_list = []
            for img in soup.select('img'):
                src = img.get('src')
                if src and src.startswith('http'):
                    img_list.append(src)
            # BUG FIX: the original built one ZhihuBeauteItem() before the
            # loop and then shadowed it with `for item in result['data']`,
            # so it mutated and yielded the raw JSON dicts. Build a fresh
            # Item per answer instead.
            item = ZhihuBeauteItem()
            item['name'] = answer['author']['name']
            item['img_url'] = img_list
            yield item
        # NOTE(review): the API pages by item offset with limit=5, so
        # stepping the offset by 1 likely re-fetches overlapping answers;
        # stepping by 5 is probably intended — TODO confirm before changing.
        self.page += 1
        if self.page > 100:
            return
        yield scrapy.Request(self.url.format(self.page), callback=self.parse)
跑完没问题的话你的文件夹会有很多很多小姐姐照片,扫一眼还是挺好看的
接下来去百度AI找人脸识别接口:先获取你自己的access_token,再调用这个接口,把文件夹里的文件逐一提交识别,然后拿到返回的分数。
# Score every downloaded image with the Baidu Face Detect v3 API:
# female faces get renamed to '<beauty-score><original-name>' and moved
# to 'imagesnew/'; everything else is deleted.
#
# BUG FIX: the original line `access_token = ###自己的###` was a
# SyntaxError ('#' starts a comment) and the variable was never used —
# the token was hardcoded into the URL a second time. Use it properly.
access_token = '###自己的###'  # fill in your own token from the Baidu AI console
headers = {'Content-Type': 'application/json'}

images_path = os.listdir('images')
print(images_path)

for i in images_path:
    with open('images/{}'.format(i), 'rb') as f:
        image_code = f.read()
    # The API wants the image as a base64 *string*, not raw bytes.
    b64 = str(base64.b64encode(image_code), encoding='UTF-8')
    url = ('https://aip.baidubce.com/rest/2.0/face/v3/detect'
           '?access_token=' + access_token)
    data = {
        'image': b64,
        'image_type': 'BASE64',
        'face_field': 'age,beauty,gender,glasses,race,quality,eye_status,emotion,face_type',
    }
    time.sleep(0.5)  # stay under the free-tier QPS limit
    req = requests.post(url, data=data, headers=headers)
    print(req.text)
    result = json.loads(req.text)

    # error_code == 0 means success; .get() guards a malformed response.
    # Also guard an empty face_list — the original indexed [0] blindly
    # and would IndexError when no face was detected.
    face_list = result['result']['face_list'] if result.get('error_code') == 0 else []
    if face_list and face_list[0]['gender']['type'] == 'female':
        age = face_list[0]['age']
        beauty = int(face_list[0]['beauty']) / 10
        print(age, beauty)
        # One shutil.move (which renames as it moves) replaces the
        # original os.rename-then-shutil.move two-step.
        shutil.move('images/{}'.format(i),
                    'imagesnew/{}'.format(str(beauty) + i))
    else:
        # Not female / no face / API error: drop the file.
        os.remove('images/{}'.format(i))
看一下8分以上颜值