This uses json + re + requests + BeautifulSoup, plus a multiprocessing Pool to crawl the Toutiao search pages in parallel.
import json
import os
import re
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup
from config import *  # project-local settings module, not shown in this post
from requests import RequestException

def get_page_index(offset, keyword):
    '''Fetch one page of the search index (returns the raw JSON text).'''
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    # Approach 1: build the query string by hand
    # url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    # response = requests.get(url)

    # Approach 2: let requests encode the parameters
    url = 'https://www.toutiao.com/search_content/'
    try:
        response = requests.get(url, params=data)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

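For reference, the commented-out "approach 1" above would also need urlencode from urllib.parse. A minimal sketch of that variant follows; get_page_index_v1 is a hypothetical name, not part of the original script, and it reuses the requests and RequestException imports already at the top of the file:

from urllib.parse import urlencode


def get_page_index_v1(offset, keyword):
    '''Same request as above, but with the query string built by hand.'''
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
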
def parse_page_index(html):
    '''Parse the index JSON and yield the article URLs.'''
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')

def get_page_detail(url):
    '''Fetch the HTML of a detail page.'''
    # Request headers, so the page is served as it would be to a normal browser
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_page_detail(html, url):
    '''Parse a detail page: pull out the title and the gallery image URLs.'''
    soup = BeautifulSoup(html, 'lxml')
    t = soup.select('title')
    for i in t:
        title = i.get_text()

    # The image list is embedded in the page source as gallery: JSON.parse("...")
    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    result = re.search(pattern, html)
    if result:
        # print(result.group(1))
        d = re.sub(r'\\', '', result.group(1))  # strip the escaping backslashes
        # print(d)
        data = json.loads(d)
        if data:
            images = [item.get('url') for item in data.get('sub_images')]
            for image in images:
                download_image(image, title)
            return {
                'title': title,
                'url': url,
                'images': images
            }
    return None

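The trickiest step in parse_page_detail is that the text captured from gallery: JSON.parse("...") still contains escaping backslashes, which is why re.sub strips them before json.loads. A tiny self-contained illustration; the sample fragment below is made up, only its shape matches the real page:

import json
import re

# Made-up fragment in the shape the detail page embeds (illustrative only)
sample = r'gallery: JSON.parse("{\"sub_images\": [{\"url\": \"http://example.com/1.jpg\"}]}"),'

pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
raw = re.search(pattern, sample).group(1)    # still contains \" escapes
data = json.loads(re.sub(r'\\', '', raw))    # drop the backslashes, then parse
print(data['sub_images'][0]['url'])          # http://example.com/1.jpg
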
def download_image(url, title):
    '''
    Download one image.
    :param url: image URL to download
    :param title: article title, used for the file name
    :return:
    '''
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            content = response.content
            save_to_image(content, title)
        return None
    except RequestException:
        return None

count = 0  # module-level counter used to give each saved image a distinct name

def save_to_image(content, title):
    '''
    Save an image file to disk.
    :param content: binary content of the image
    :param title: article title, used for the file name
    :return:
    '''
    global count
    os.makedirs('./头条', exist_ok=True)  # make sure the output directory exists
    name = title + str(count)
    file_path = './头条/{}.{}'.format(name, 'jpg')
    with open(file_path, 'wb') as f:
        count += 1
        f.write(content)

def main(offset):
    '''Entry point for one offset: fetch the index page, then every article on it.'''
    html = get_page_index(offset, '街拍')
    # print(html)
    for url in parse_page_index(html):
        if url:
            # print(url)
            html = get_page_detail(url)
            if html:
                # print(parse_page_detail(html, url))
                result = parse_page_detail(html, url)
                if result:
                    print(result)
                    # save_to_mongo(result)


GROUP_START = 1
GROUP_END = 20
if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END)]
    pool = Pool()
    pool.map(main, groups)
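main leaves save_to_mongo commented out, and config.py is never shown in this post. If you want that step, a minimal sketch could look like the following, assuming config defines MONGO_URL, MONGO_DB and MONGO_TABLE (these names are an assumption, not part of the original) and that pymongo is installed:

import pymongo

from config import MONGO_URL, MONGO_DB, MONGO_TABLE  # assumed settings, not shown in the post

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def save_to_mongo(result):
    '''Insert one parsed result dict into MongoDB.'''
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB', result.get('title'))
        return True
    return False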