First, install the required third-party library. Note that json and re are part of the Python standard library and cannot (and need not) be installed with pip; only requests needs installing:

pip install requests
The code is as follows:
import datetime
import json
import re

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch the page and return its HTML, or None on any request failure."""
    try:
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield the link and topic text from each row of the trending table."""
    pattern = re.compile('<tr class="">.*?<a href="(.*?)" target="_blank">(.*?)</a>.*?</tr>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'url': 'https://s.weibo.com' + item[0],
            'topic': item[1].replace('#', '', 2),  # strip the surrounding '#' marks
        }


def write_to_file(content):
    """Append one record, prefixed with the retrieval time, to the results file."""
    with open('热搜榜单.txt', 'a', encoding='utf-8') as f:
        f.write('Retrieved at ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + ' ')
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main():
    # 0 = real-time hot search list, 1 = social event list
    name = int(input('Hot search list: 0; social event list: 1  '))
    table = ['realtimehot', 'socialevent']
    url = 'https://s.weibo.com/top/summary?cate=' + table[name]
    html = get_one_page(url)
    if html is None:  # guard against a failed request before parsing
        print('Failed to fetch the page')
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    main()
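In practice, s.weibo.com may return a redirect or an empty list unless the request carries a browser User-Agent and a logged-in Cookie. If the script prints nothing, a variant of get_one_page along the lines of the sketch below may help; the header values are placeholders you must copy from your own browser session, not values taken from this article:

def get_one_page_with_headers(url):
    # Placeholder values: supply a real browser User-Agent string and your
    # own logged-in Weibo Cookie; the strings below are illustrative only.
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Cookie': '...',
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        return response.text if response.status_code == 200 else None
    except RequestException:
        return None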
This script can then be nested with the earlier keyword-based search: feed each trending topic back into the keyword crawler in a multi-level loop to crawl Weibo more broadly, as sketched below.
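A minimal sketch of that nesting, assuming the earlier section defined a search_pages(keyword) generator that yields posts matching a keyword (the name search_pages is hypothetical, standing in for whatever the keyword crawler actually exposes):

import time

def crawl_trending_topics():
    # Outer loop: today's trending topics; inner loop: posts per topic.
    html = get_one_page('https://s.weibo.com/top/summary?cate=realtimehot')
    if html is None:
        return
    for item in parse_one_page(html):
        for post in search_pages(item['topic']):  # hypothetical keyword crawler
            write_to_file(post)
            time.sleep(1)  # pause between requests to avoid being blocked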