Python3 用 requests 与 multiprocessing 多进程爬取今日头条图片

仅供交流学习

#coding=utf-8

import json

import requests

import re

import os

from multiprocessing import Pool

from urllib.parse import urlencode

from fake_useragent import UserAgent

from hashlib import md5

from bs4 import BeautifulSoup

ua=UserAgent()

keyword="街拍"

def get_page(offset, keyword=keyword):
    """Fetch one page of Toutiao search results and extract article URLs.

    Args:
        offset: pagination offset passed to the search API (multiples of 20
            per the ``count`` parameter; callers here pass multiples of 10).
        keyword: search term; defaults to the module-level ``keyword`` so
            existing callers are unaffected.

    Returns:
        A list of article URLs (entries may be None when the API omits
        ``article_url``), or None when the fetch fails, the body is not
        JSON, or the payload has no ``data`` key.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
    }
    base = "https://www.toutiao.com/api/search/content/?"
    url = base + urlencode(params)
    content = get_content(url)
    # get_content returns None on any failure; the original passed that
    # straight into json.loads and raised TypeError.
    if content is None:
        return None
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        # The endpoint sometimes answers with HTML (e.g. an anti-bot page).
        return None
    if data and "data" in data:
        return [item.get('article_url') for item in data.get('data')]
    return None

#保存结果到文件

def write_to_file(content):
    """Append one already-serialized record to the local ``res.txt`` file."""
    with open("res.txt", mode="a", encoding="utf-8") as out:
        out.write(content)

#解析获取内页的图片

def parse_page_image(url):
    """Parse an article page: extract its title and gallery image URLs,
    append the record to res.txt, and download every image.

    Args:
        url: article page URL returned by the search API.
    """
    content = get_content(url)
    if content is None:
        return
    # The page <title> doubles as the record title; guard the empty case
    # (the original indexed res[0] and raised IndexError on title-less pages).
    soup = BeautifulSoup(content, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    # The gallery payload is embedded in an inline script as an escaped
    # JSON string: gallery: JSON.parse("..."),
    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    for raw in pattern.findall(content):
        # SECURITY NOTE: eval() on scraped page content is dangerous; it is
        # kept only to un-escape the embedded string exactly as the original
        # did. Prefer an explicit unescape in a future change.
        unescaped = eval("'{}'".format(raw))
        try:
            data = json.loads(unescaped)
        except json.JSONDecodeError:
            continue
        if data and "sub_images" in data:
            # Distinct names fix the original's reuse of `items` (reassigned
            # while being iterated) and `url` (shadowed the parameter).
            img_urls = [sub.get("url") for sub in data.get('sub_images')]
            record = {
                'title': title,
                'imgList': img_urls,
                'url': url,
            }
            write_to_file(json.dumps(record, ensure_ascii=False) + "\n")
            for img_url in img_urls:
                get_img(img_url)

#保存图片

def save_img(content):
    """Persist raw image bytes under ./img/, named by the content's MD5.

    Args:
        content: raw image bytes (HTTP response body).
    """
    # The original crashed with FileNotFoundError when ./img was missing;
    # create it up front.
    os.makedirs(os.path.join(os.getcwd(), "img"), exist_ok=True)
    # MD5 of the bytes deduplicates identical images downloaded twice.
    path_file = "{0}/{1}/{2}.{3}".format(os.getcwd(), "img", md5(content).hexdigest(), "jpg")
    print(path_file)
    with open(path_file, "wb") as f:
        f.write(content)

#获取远程图片

def get_img(url):
    """Download one image and hand its bytes to save_img.

    Failures (network errors, timeouts, disk errors) are logged and
    swallowed so one broken image does not abort the whole crawl.
    """
    try:
        headers = {'User-Agent': ua.chrome}
        # Timeout prevents a stalled download from hanging the pool worker.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            save_img(response.content)
    except (requests.RequestException, OSError) as exc:
        # Narrowed from a bare `except:` that also swallowed
        # KeyboardInterrupt/SystemExit; still best-effort, but visible.
        print("get_img failed for {}: {}".format(url, exc))

#获取文本内容

def get_content(url):
    """GET a URL and return its decoded body text, or None on any failure.

    Returns:
        The response body when the server answers 200; otherwise None
        (non-200 status, network error, or timeout).
    """
    headers = {'User-Agent': ua.chrome}
    try:
        # Timeout keeps a dead host from blocking a pool worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Narrowed from a bare `except:`; the fetch is best-effort and the
        # caller treats None as "page unavailable".
        return None
    if response.status_code == 200:
        return response.text
    return None

def main(offset):
    """Worker entry point: fetch one search-result page, then crawl each
    article it lists."""
    article_urls = get_page(offset)
    if article_urls is None:
        return
    for article_url in article_urls:
        parse_page_image(article_url)

if __name__ == '__main__':
    # Fan the crawl out across worker processes: one search-page offset
    # (0, 10, ..., 90) per task.
    worker_pool = Pool()
    offsets = [page * 10 for page in range(10)]
    worker_pool.map(main, offsets)
    worker_pool.close()
    worker_pool.join()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值