山东大学项目实训
本次实验实现了对今日头条新闻数据的爬取:给定一个关键词,即可抓取相关新闻内容及其评论。
def get_json(self, query, times):
    """Build the Toutiao search-API URL for one page of results.

    Args:
        query: keyword to search for.
        times: zero-based page index; the API pages in steps of 20.

    Returns:
        The fully encoded search URL as a string.
    """
    print(times * 20)  # debug: current result offset
    base_url = 'https://www.toutiao.com/api/search/content/?'
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': times * 20,  # 20 results per page
        'format': 'json',
        'keyword': query,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): timestamp/_signature look captured from one browser
        # session and will likely expire — confirm they still work and
        # consider generating them per request.
        'timestamp': '1591334689562',
        '_signature': 'eogINAAgEBC837G5qXUW1nqJSSAACRUUacJVWMYPAnDzgyhUN8Z50mnnocKxzlDN68ESVavW7QohTkc-zAqL6T3-Yix63Z7STBVenpHHWa-svVRQHbXc9VElqbkshKKcmNI'}
    url = base_url + urlencode(params)
    print(url)  # debug: final request URL
    return url
def get_comment(self, url, number):
    """Fetch the comment texts for one article.

    Args:
        url: a full article URL when number == 1, or a bare article id
            when number == 2.
        number: selects how to interpret *url* (see above).

    Returns:
        List of comment text strings (empty if the article has none).
    """
    article_id = None
    if number == 1:
        # Assumes the article id is the 5th '/'-separated component of
        # the URL — TODO confirm against the actual URL shapes returned
        # by the search API.
        article_id = url.split('/')[4]
    if number == 2:
        article_id = url  # caller already extracted the id
    api_url = self.get_comment_json(article_id)
    response = requests.get(api_url, headers=self.comment_headers)
    response.encoding = 'utf-8'
    payload = json.loads(response.content.decode('utf-8'))
    return [entry['comment']['text'] for entry in payload['data']]
def get_comment_json(self, id):
    """Build the Toutiao comments-API URL for one article.

    Args:
        id: article id; used as both group_id and item_id.

    Returns:
        The fully encoded comments URL as a string.
    """
    # The original code had a dangling string literal here (an example
    # URL) and a no-op `id = id`; both were dead code and are removed.
    base_url = 'https://www.toutiao.com/article/v2/tab_comments/?'
    params = {
        'aid': '24',
        'app_name': 'toutiao-web',
        'group_id': id,
        'item_id': id,
        'offset': '0',
        'count': '5',  # first 5 comments only
    }
    return base_url + urlencode(params)
def main(self, query):
    """Crawl Toutiao search results for *query* page by page.

    Writes one "title,datetime,comment_list" line per article to
    ./疫情.txt and stops when the search API reports an empty page.
    Sleeps 5 seconds between pages to avoid hammering the server.
    """
    # `with` guarantees the file is closed even if a request raises.
    with open('./疫情.txt', 'w+', encoding="utf-8") as f:
        while True:
            url = self.get_json(query, self.offset)
            self.offset += 1
            response = requests.get(url, headers=self.headers)
            response.encoding = 'utf-8'
            result = json.loads(response.content.decode('utf-8'))
            if result['count'] == 0:
                print('finish crawling data')
                break
            for item in result['data']:
                try:
                    # Strip the search-highlight markup from the title.
                    title = item['title'].replace('<em>', '').replace('</em>', '')
                    published = item['datetime']
                    try:
                        comment_url = item['display']['info']['url']
                        comments = self.get_comment(comment_url, 1)
                    except Exception:
                        # No usable article URL (or the fetch failed):
                        # fall back to querying comments by article id.
                        comments = self.get_comment(item['id'], 2)
                    f.write(str(title) + ',' + str(published) + ',' + str(comments) + '\n')
                    print(title, published, comments)
                except KeyError as err:
                    # Record is missing an expected field; log which key
                    # (the original printed the KeyError class, a bug).
                    print(err)
            time.sleep(5)  # throttle between pages
爬取完成后,输出文件的每行依次为:新闻标题、发布时间、评论列表。