Python:东方财富网股吧文本数据爬取
分享一下写论文时爬取数据用到的代码。如果发现问题或有改进建议,欢迎小伙伴们在评论区一起讨论。
def getzssz399001DATA(x,y): #爬取x至y页的帖子
import requests
from bs4 import BeautifulSoup
output = open('zssz399001_f_{}_{}.xls'.format(x,y),'w',encoding='utf-8')
output.write('read\tcomment\thref\tposter\ttitle\ttime\ttime2\tpost\n')
for i in range(y,x,-1):
url = "http://guba.eastmoney.com/list,zssz399001,f_{}.html".format(i)
r1 = requests.get(url)
r1.raise_for_status()
soup = BeautifulSoup(r1.content,"html.parser")
l1a1 = soup.find_all('span',class_ = "l1 a1")
l2a2 = soup.find_all('span',class_