# Crawler adapted from a senior classmate's code: his original version
# crawled all news for all stocks; this version is modified to crawl only
# the news of a specified date.
# -*- coding: UTF-8 -*-
import time,urllib2,urllib,StringIO,sys,os,multiprocessing,sqlite3
from datetime import datetime
def utf8(i):
    """Transcode a GBK byte string to UTF-8, silently dropping bytes
    that cannot be decoded as GBK."""
    decoded = i.decode('gbk', 'ignore')
    return decoded.encode('utf8')
def getNews(stockName):
print stockName,'Doing'
pages = range(1,3)
pages = [str(p) for p in pages]
check = '鏆傛椂娌℃湁鏁版嵁'
check = check.decode('utf8').encode('gbk')
for stock in [stockName]:
date='20130319'
stockFile = "../StockData/News4Daily/News" + stock + "_"+date+".txt"
fp = file(stockFile,'w')
for page in pages:
print '0',stock
stockUrl = "http://money.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=" + stock + "&Page=" + page
try:
stockWeb = urllib.urlopen(stockUrl).read()
if check in stockWeb:
break
except:
print 'StockError',stock,page
continue
print '1',stock
stockWeb=stockWeb.split('\n')
stockflag=0
for f in stockWeb:
if stockflag==1:
f=f.split('<a')
for i in f:
if '20130319' in i:
startUrl=i.find('http')
endUrl=i.find('shtml')
newsUrl = i[startUrl:endUrl+5]
try:
newsWeb = urllib.urlopen(newsUrl).read()
except:
print 'NewsError',stock,page
continue
print '2',stock
newsWeb = newsWeb.split('\n')
newsFlag = 0
news = ""
for text in newsWeb:
if newsFlag == 1:
news += text
if 'publish_helper name' in text:
newsFlag = 1
if ('news_keyword_pub' in text or '<div' in text):
newsFlag = 0
while ('<' in news and '>' in news):
start = news.find('<')
end = news.find('>')
if start<end:
news = news[:start] + news[end+1:]
else:
news = news[:end] + news[end+1:]
news = news.replace(' ','')
news = news.replace('\t','')
news = news.replace('\xa1\xa1\xa1\xa1','')
news = news.replace(' ','')
fp.write(utf8(news))
print '3',stock
fp.write('\n')
break
if 'datelist' in f:
stockflag = 1
fp.close()
print '4',stock
return
if __name__ == '__main__':
print 'GetdailyNews Start'
stocks = open('../StockData/NameList.txt').read().split('\n')
if not os.path.exists('../StockData/News4Daily'):
os.mkdir('../StockData/News4Daily')
pool = multiprocessing.Pool(processes=4) # MultiProcessor
for stock in stocks:
pool.apply_async(getNews,(stock,))
pool.close()
pool.join()
print 'GetdailyNews Done'