# 这是一个爬取股吧前十页数据的爬虫 (a crawler that scrapes the first ten pages of Guba stock-forum data)
import re, json
import requests
def write_to_json(infos):
    """Serialize *infos* as JSON into ``movies.json`` in the working directory.

    Any existing file is overwritten (mode ``'w'``).

    Args:
        infos: Any object ``json.dump`` can serialize.

    Raises:
        TypeError: If *infos* contains a value that is not JSON-serializable.
        OSError: If the file cannot be opened for writing.
    """
    # NOTE(review): the file name 'movies.json' looks like a leftover from a
    # different scraper -- confirm before renaming; kept as-is for callers.
    with open('movies.json', 'w', encoding='utf-8') as fp:
        # ensure_ascii=False writes non-ASCII text (e.g. Chinese) as real
        # UTF-8 characters instead of \uXXXX escapes, so the file stays
        # human-readable; it parses back to the same data either way.
        json.dump(infos, fp, ensure_ascii=False)
def parse_page(html_str):
ul_p = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">(.*?)</ul>', re.S)
ul_content = ul_p.search(html_str).group()
cite_p = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">(.*?)</ul>', re.S)
cite_list = cite_p.findall(ul_content)
'''
阅读
评论
标题
作者
更新时间
详情页
'''
for cite in cite_list:
cite_q = re.compile(r'<li>(.*?)</li>', re.S)
cite_list2 = cite_q.findall(cite)
for cite2 in cite_list2:
clk_p = re.compile(r'<cite>(.*?)</cite>', re.S)
clk = clk_p.findall(cite2)
read_count = clk