1 需求
爬取上市公司贵州茅台（股票代码 600519）在东方财富股吧的帖子标题及链接。
2 代码实现
from selenium import webdriver
import re
# Guba (stock forum) list page for Kweichow Moutai, stock code 600519.
GUBA_URL = 'http://guba.eastmoney.com/list,600519.html'

# A single pattern captures the link and the title from the SAME anchor.
# Matching them with two independent patterns lets the two result lists
# drift apart (e.g. when a row has a badge element before the anchor),
# which pairs a title with the wrong link or raises IndexError.
POST_PATTERN = re.compile(
    r'<span class="l3 a3">.*?<a href="(.*?)" title="(.*?)"'
)
# Strips any markup left inside a captured title.
TAG_PATTERN = re.compile(r'<.*?>')


def extract_posts(page_html):
    """Extract post titles and links from the forum list page HTML.

    Args:
        page_html: HTML source of the list page as a string.

    Returns:
        A list of ``(title, href)`` tuples in page order. ``title`` is the
        anchor's ``title`` attribute with tags removed and surrounding
        whitespace stripped; ``href`` is the anchor's ``href`` value exactly
        as it appears in the page. Returns an empty list when nothing
        matches.
    """
    return [
        (TAG_PATTERN.sub('', raw_title).strip(), link)
        for link, raw_title in POST_PATTERN.findall(page_html)
    ]


def fetch_page_source(url):
    """Load *url* in headless Chrome and return the rendered page source.

    The browser is always shut down, even if loading the page fails, so no
    Chrome/chromedriver process is left running.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get(url)
        return browser.page_source
    finally:
        browser.quit()


def main():
    """Print a numbered list of post titles, each followed by its link."""
    posts = extract_posts(fetch_page_source(GUBA_URL))
    for number, (post_title, post_href) in enumerate(posts, start=1):
        print(str(number) + "." + post_title)
        print("(" + post_href + ")")
        print()


if __name__ == '__main__':
    main()