#!/usr/bin/python
# !coding=utf-8
import os
import requests
import re
from urllib.parse import urlencode
from hashlib import md5
from multiprocessing.pool import Pool
def get_page(offset):
    """Fetch one page of Toutiao search results as parsed JSON.

    Parameters:
        offset: pagination offset passed to the search API (multiple of 20).

    Returns:
        The decoded JSON dict on HTTP 200; None on any other status code
        or on any request failure (connection error, timeout, ...).
    """
    base_url = "https://www.toutiao.com/search_content/?"
    params = {
        "offset": offset,
        "format": "json",
        "keyword": "火影劫",
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
    }
    url = base_url + urlencode(params)
    try:
        # timeout keeps a pool worker from hanging forever on a stalled socket
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # RequestException covers ConnectionError as well as Timeout etc.
        return None
    if response.status_code == 200:
        return response.json()
    # The original fell through implicitly on non-200; make it explicit.
    return None
def get_images(json):
    """Yield {"title", "image"} dicts extracted from one page of search JSON.

    Parameters:
        json: the decoded API response dict (or None when the fetch failed).

    Yields:
        dicts with a punctuation-stripped "title" (safe to use as a
        directory name) and an absolute "image" URL.
    """
    if not json:
        return
    # Guard: the API may return a payload without a usable "data" list.
    for item in json.get("data") or []:
        if item.get("open_url") is None:
            continue
        title = item.get("title")
        # Strip whitespace plus ASCII/CJK punctuation so the title works
        # as a directory name.  Raw string avoids invalid-escape warnings.
        content = re.sub(r"[\s+\.\!\/_,::$%^*“”《》(+\"\']+|[+——!,。??、~@#¥%……&*()]+", "", title)
        images = item.get("image_url")
        yield {
            "title": content,
            # image_url is protocol-relative ("//host/..."): drop the two
            # slashes and prepend an explicit scheme.
            "image": "http://" + images[2:],
        }
def save_images(item):
    """Download item["image"] into a directory named after item["title"].

    The file name is the MD5 hex digest of the image bytes, so downloading
    the same image twice overwrites the same file (deduplication).
    Prints a message instead of raising when the download fails.
    """
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the original
    os.makedirs(item["title"], exist_ok=True)
    try:
        # timeout keeps a pool worker from hanging forever on a stalled socket
        response = requests.get(item["image"], timeout=10)
        # content hash as file name deduplicates identical images
        path = "{0}/{1}.{2}".format(item["title"], md5(response.content).hexdigest(), "jpg")
        with open(path, "wb") as f:
            f.write(response.content)
    except requests.RequestException:
        # broadened from ConnectionError so the new timeout is handled too
        print("Have Error!")
def main(offset=None):
    """Fetch, parse and save all images for the given API offset.

    Bug fix: this function is invoked via ``pool.map(main, groups)``, which
    passes one offset per call, but the original ``main()`` accepted no
    arguments and every worker died with TypeError.  The new parameter is
    optional, so a bare ``main()`` still walks the original hard-coded
    ``range(0, 2)`` pages.

    Parameters:
        offset: a single API offset to process; when None, process offsets
            0 and 20 as the original zero-argument version did.
    """
    offsets = [offset] if offset is not None else [i * 20 for i in range(0, 2)]
    for off in offsets:
        data = get_page(off)
        for item in get_images(data):
            save_images(item)
# Page-group bounds: group x maps to API offset x * 20.
GROUP_START = 1
GROUP_END = 1

if __name__ == "__main__":
    # Fan the offsets out over a process pool, one task per offset.
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    # NOTE(review): pool.map passes one offset argument to main per task,
    # so main must accept one argument — confirm its signature matches.
    pool.map(main, groups)
    pool.close()  # no further tasks will be submitted
    pool.join()   # wait for the workers to finish before exiting
# (scraped blog footer, not code) Python Ajax batch download
# Latest recommended article published 2024-05-01 21:59:49