Python 多线程爬取今日头条街拍数据(附源码与思路注释)

这里用的是json+re+requests+beautifulsoup+多线程

import json
import os
import re
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup
from requests import RequestException

from config import *

10

def get_page_index(offset, keyword):
    """Fetch one page of the search-index JSON for *keyword*.

    :param offset: pagination offset (multiples of 20)
    :param keyword: search term, e.g. '街拍'
    :return: response body text on HTTP 200, otherwise None
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/'
    try:
        # timeout keeps a stalled connection from hanging a pool worker forever
        response = requests.get(url, params=data, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

36

def parse_page_index(html):
    """Yield every article URL found in a search-index JSON payload.

    Tolerates a failed fetch: get_page_index returns None on error, and the
    original code crashed with TypeError when that None reached json.loads.

    :param html: raw JSON text as returned by get_page_index (may be None)
    :yield: value of each item's 'article_url' key (may be None for ads)
    """
    if not html:
        return  # upstream fetch failed; nothing to parse
    try:
        data = json.loads(html)
    except ValueError:
        return  # malformed payload — skip this page
    if data and 'data' in data:
        for item in data['data']:
            yield item.get('article_url')

44

def get_page_detail(url):
    """Download the HTML of a detail (gallery) page.

    :param url: article URL taken from the index JSON
    :return: page HTML on HTTP 200, otherwise None
    """
    # A desktop browser UA is sent because the site blocks the default
    # python-requests user agent.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        # timeout keeps a stalled connection from hanging a pool worker forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

59

def parse_page_detail(html, url):
    """Extract the title and image URLs from a gallery page and download each image.

    :param html: detail-page HTML
    :param url: the page URL (echoed back in the result)
    :return: dict with 'title', 'url', 'images', or None when no gallery is found
    """
    soup = BeautifulSoup(html, 'lxml')
    node = soup.select_one('title')
    # Default to '' so a page without <title> cannot raise NameError below
    # (the original left `title` unbound in that case).
    title = node.get_text() if node else ''

    # The gallery data is embedded as the argument of JSON.parse("...") in a
    # script tag; capture the escaped JSON string.
    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    result = re.search(pattern, html)
    if result:
        # Strip the backslash escaping so the payload becomes valid JSON.
        d = re.sub(r'\\', '', result.group(1))
        data = json.loads(d)
        if data:
            images = [item.get('url') for item in data.get('sub_images')]
            for image in images:
                download_image(image, title)
            return {
                'title': title,
                'url': url,
                'images': images,
            }
    # Original ended with a no-op `else: None`; make the miss case explicit.
    return None

87

def download_image(url, title):
    """Fetch a single image and hand its bytes to save_to_image.

    :param url: image URL to download
    :param title: gallery title, used by save_to_image to build the file name
    :return: None
    """
    print('正在下载', url)  # progress log
    try:
        # timeout keeps a stalled connection from hanging a pool worker forever
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            save_to_image(response.content, title)
        return None
    except RequestException:
        return None

104

# Process-local counter used to give each saved image a unique file-name suffix.
count = 0


def save_to_image(content, title):
    """Write image bytes to ./头条/<title><count>.jpg.

    :param content: raw image bytes
    :param title: gallery title used as the file-name stem
    """
    # NOTE: the original placed this docstring after `global count`, where it
    # is just a discarded string expression, not a docstring.
    global count
    name = title + str(count)
    file_path = './头条/{}.{}'.format(name, 'jpg')
    # Create the target directory on first use instead of crashing with
    # FileNotFoundError when ./头条 does not exist yet.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        count += 1
        f.write(content)

121

def main(offset):
    """Crawl one index page: fetch the listing, then every article on it."""
    index_html = get_page_index(offset, '街拍')
    for article_url in parse_page_index(index_html):
        if not article_url:
            continue  # some listing entries carry no article link (ads)
        detail_html = get_page_detail(article_url)
        if not detail_html:
            continue
        parsed = parse_page_detail(detail_html, article_url)
        if parsed:
            print(parsed)
            # save_to_mongo(parsed)  # persistence hook, disabled in original

139

# First and one-past-last page group; offsets are group * 20.
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    # range() excludes GROUP_END, so offsets run 20, 40, ... (GROUP_END-1)*20.
    groups = [i * 20 for i in range(GROUP_START, GROUP_END)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # The original never shut the pool down, leaking worker processes.
        pool.close()
        pool.join()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值