这里用到的是 json + re + requests + BeautifulSoup + 多进程（multiprocessing.pool.Pool 进程池）。
# Dependencies for the scraper. Statement order is kept exactly as in the
# original listing so that name shadowing by the wildcard import is unchanged.
import json
import re
from multiprocessing.pool import Pool  # process pool (not threads)

import requests
from bs4 import BeautifulSoup
# NOTE(review): wildcard import from the project-local config module; the
# names it provides are not visible here, so it is kept as-is.
from config import *
from requests import RequestException
10
11 defget_page_index(offset, keyword):12 '''得到一个页面的索引'''
13 data ={14 'offset': offset,15 'format': 'json',16 'keyword': keyword,17 'autoload': 'true',18 'count': '20',19 'cur_tab': '1',20 'from': 'search_tab'
21 }22 #请求方式一
23 #url = 'https://www.toutiao.com/search_content/?'+urlencode(data)
24 #response = requests.get(url)
25
26 #请求方式二
27 url = 'https://www.toutiao.com/search_content/'
28 try:29 response = requests.get(url, params=data)30 if respons