This uses json + re + requests + BeautifulSoup, plus a multiprocessing Pool to crawl the Toutiao search pages in parallel.
import json
import os
import re
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup
from config import *  # project-local settings module, not shown in this post
from requests import RequestException

def get_page_index(offset, keyword):
    '''Fetch one page of the search index (returns the raw JSON text).'''
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    # Approach 1: build the query string by hand
    # url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    # response = requests.get(url)

    # Approach 2: let requests encode the parameters
    url = 'https://www.toutiao.com/search_content/'
    try:
        response = requests.get(url, params=data)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

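For reference, the commented-out "approach 1" above would also need urlencode from urllib.parse. A minimal sketch of that variant follows; get_page_index_v1 is a hypothetical name, not part of the original script, and it reuses the requests and RequestException imports already at the top of the file:

from urllib.parse import urlencode


def get_page_index_v1(offset, keyword):
    '''Same request as above, but with the query string built by hand.'''
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
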
def parse_page_index(html):
    '''Parse the index JSON and yield the article URLs.'''
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')

def get_page_detail(url):
    '''Fetch the HTML of a detail page.'''
    # Request headers, so the page is served as it would be to a normal browser
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_page_detail(html, url):
    '''Parse a detail page: pull out the title and the gallery image URLs.'''
    soup = BeautifulSoup(html, 'lxml')
    t = soup.select('title')
    for i in t:
        title = i.get_text()

    # The image list is embedded in the page source as gallery: JSON.parse("...")
    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    result = re.search(pattern, html)
    if result:
        # print(result.group(1))
        d = re.sub(r'\\', '', result.group(1))  # strip the escaping backslashes
        # print(d)
        data = json.loads(d)
        if data:
            images = [item.get('url') for item in data.get('sub_images')]
            for image in images:
                download_image(image, title)
            return {
                'title': title,
                'url': url,
                'images': images
            }
    return None

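The trickiest step in parse_page_detail is that the text captured from gallery: JSON.parse("...") still contains escaping backslashes, which is why re.sub strips them before json.loads. A tiny self-contained illustration; the sample fragment below is made up, only its shape matches the real page:

import json
import re

# Made-up fragment in the shape the detail page embeds (illustrative only)
sample = r'gallery: JSON.parse("{\"sub_images\": [{\"url\": \"http://example.com/1.jpg\"}]}"),'

pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
raw = re.search(pattern, sample).group(1)    # still contains \" escapes
data = json.loads(re.sub(r'\\', '', raw))    # drop the backslashes, then parse
print(data['sub_images'][0]['url'])          # http://example.com/1.jpg
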
def download_image(url, title):
    '''
    Download one image.
    :param url: image URL to download
    :param title: article title, used for the file name
    :return:
    '''
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            content = response.content
            save_to_image(content, title)
        return None
    except RequestException:
        return None

count = 0  # module-level counter used to give each saved image a distinct name

def save_to_image(content, title):
    '''
    Save an image file to disk.
    :param content: binary content of the image
    :param title: article title, used for the file name
    :return:
    '''
    global count
    os.makedirs('./头条', exist_ok=True)  # make sure the output directory exists
    name = title + str(count)
    file_path = './头条/{}.{}'.format(name, 'jpg')
    with open(file_path, 'wb') as f:
        count += 1
        f.write(content)

def main(offset):
    '''Entry point for one offset: fetch the index page, then every article on it.'''
    html = get_page_index(offset, '街拍')
    # print(html)
    for url in parse_page_index(html):
        if url:
            # print(url)
            html = get_page_detail(url)
            if html:
                # print(parse_page_detail(html, url))
                result = parse_page_detail(html, url)
                if result:
                    print(result)
                    # save_to_mongo(result)


GROUP_START = 1
GROUP_END = 20
if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END)]
    pool = Pool()
    pool.map(main, groups)
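main leaves save_to_mongo commented out, and config.py is never shown in this post. If you want that step, a minimal sketch could look like the following, assuming config defines MONGO_URL, MONGO_DB and MONGO_TABLE (these names are an assumption, not part of the original) and that pymongo is installed:

import pymongo

from config import MONGO_URL, MONGO_DB, MONGO_TABLE  # assumed settings, not shown in the post

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def save_to_mongo(result):
    '''Insert one parsed result dict into MongoDB.'''
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB', result.get('title'))
        return True
    return False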