缘起
我的好朋友的毕业论文需要爬取基金经理的新闻数量,并且统计新闻数量与基金的成交率的关系,我当然义不容辞啦。
任务描述:爬取三百位基金经理“百度新闻”中的搜索结果,并且将其分别按月和按季度统计新闻数量。
使用到的技术
BeautifulSoup、urllib、requests、Python 文件 I/O
Talk is cheap. Show me the code.
主函数:GCWspider_main.py
import url_manager,html_downloader,html_parser,html_output
import xlwt
import xlrd
import urllib
class SpiderMain(object):
    """Crawl Baidu News search-result pages for one fund manager and
    tally article counts into per-month and per-quarter buckets,
    then write both tallies into the given spreadsheet sheets."""

    def __init__(self):
        # Collaborators live in sibling modules of this project.
        # NOTE: HtmlDownoader is spelled that way in html_downloader —
        # keep the name to match that module's definition.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownoader()
        self.parser = html_parser.HtmlParser()
        self.output = html_output.HtmlOutputer()

    def craw(self, sheet1, sheet2, root_url, num, name):
        """Breadth-first crawl starting from root_url.

        sheet1, sheet2: xlwt worksheets that receive the monthly and
            quarterly counts respectively.
        root_url: first Baidu News result page for this manager.
        num: row index used when writing into the sheets.
        name: the fund manager's name (written alongside the counts).
        """
        count = 1
        # One zero-initialised counter per month and per quarter for the
        # years 2000-2015 (16 years * 12 months, 16 years * 4 quarters).
        # The parser fills these in as it reads each result page.
        result_monthly = [0] * ((2016 - 2000) * 12)
        result_quarterly = [0] * ((2016 - 2000) * 4)
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crawling URL => %d ... : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, result_monthly, result_quarterly = self.parser.parse(
                    new_url, html_cont, result_monthly, result_quarterly)
                self.urls.add_new_urls(new_urls)
                count += 1
            except Exception as e:
                # Best-effort crawl: report the failure and continue with
                # the next queued URL rather than aborting the whole run.
                print(e)
                print('crawling failure')  # fixed typo: was 'crawing failure'
        self.output.collect_data(sheet1, result_monthly, name, num)
        self.output.collect_data(sheet2, result_quarterly, name, num)
if __name__=="__main__":
wb = xl