1.项目简介
任务:爬取某基金网站数据, 以单页单条模式存储。
性能:抓取各式数据,以dict模式存入数据库,以json模式导出到文本。
最后实现90分钟内对全站约3500支基金数据更新, 约100条error。
2.版块说明
List_page.py
简单抓取列表页面基金代号,方便后面的url队列生成。
Crawl.py
分为Gen, Parse, Clean三个部分。
Gen为url队列生成, Parse做单页解析, Clean做简单的数据清洗。
-库调用:
requests + bs4 + pymongo 为主要的爬取函数
codecs + json 链接文本输出
threading + datetime 控制多线程运行,以及负责进度反馈
-各部分:
Gen:通过生成器函数实现,节省空间,结构也更清晰。
Parse:观察网页结构后,划出相应的版块解析,一方面条理更清晰,方便调试,另一方面则实现了并行。
Clean:在这个项目里并没有太多处理,只是在网页原始数据中把该拆分的部分拆分出来。
3.总结
通过本次项目实践,对于数据采集有了比较全面的认识,其间的各种细节技术都是在实践中学来的。
在项目中做了很多新尝试,比如报错系统和进度反馈,这些可有可无的部分既可以方便调试,也是为了让数据更加可靠。
为了使程序自由度高,没有使用爬虫框架,尽量用外层函数库编写。这样可以根据不同网页情况自己修改。
在多线程控制上仍然存在一些问题,为什么在每个单页解析里都要初始进程list呢,我试过把list放在外面,进程控制会报错。
-对于数据存储结构的心得:
根据不同需要以及数据本身的特征,我找到了一些简单的存储方式。决定存储方式的当然还是dict以及json本身的格式。
1. dict: {'attribute' : …}
2. list: [a, b, c…]
对于一些表格,在不同的需要下有不同的存储方式:
1. dict: {date : {f(d), g(d)}, d : {}, d: {}…}
2. list: [{date, f(d), g(d)}, {}, {}…]
3. multi-list: {date : [d1, d2, d3…], f : [f1, f2, f3…], g : [g1, g2, g3…]}
dict方便对table单行查询,list能保证数据的完整性,multi-list便于对数据可视化。
又丑又长的代码我本是不想贴的, 只是还没找到合适的管理方式。
代码块中的地址为了保护网页权益擦掉了。
import requests
from bs4 import BeautifulSoup
import pymongo
import codecs
import json
import datetime
import threading
def Gen():
    """Yield one target url per fund listed in ``fund.json``.

    Side effects: rebinds the module globals ``dataframe`` (the current
    fund record) and ``home`` (a fresh result dict for that fund) on every
    iteration, and prints a progress report every 100 funds.
    """
    global dataframe
    global home
    tit = 0  # number of funds processed so far
    # BUG FIX: the original called ``file.close`` without parentheses, so
    # the handle was never closed; a context manager closes it reliably.
    with codecs.open('fund.json', 'r', 'utf-8') as file:
        tmp = json.load(file)
    datagram = tmp.get('fund')
    for dataframe in datagram:
        home = {}
        tit += 1
        if tit % 100 == 0:
            # progress feedback: count, timestamp and completion percentage
            now = datetime.datetime.now()
            print(tit, ' / ', len(datagram))
            print(now.strftime('%Y-%m-%d %H:%M:%S'))
            print(str(tit * 100 / len(datagram)) + '%')
            #input()
        url = ...  # real url template removed to protect the site
        yield url
def Getsoup(url):
    """Fetch *url* and return the page parsed into a BeautifulSoup tree.

    The response body is decoded as UTF-8 regardless of the charset the
    server declares.
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
def _dd_values(container, keys):
    """Return {key: cell text} from the first ``len(keys)`` <dd> cells.

    Each cell's text has its "label:" prefix (up to the first ':') and
    non-breaking spaces stripped.  Raises (IndexError / AttributeError)
    when *container* is missing or has too few cells — callers catch this.
    """
    cells = container.find_all('dd')
    out = {}
    for i, key in enumerate(keys):
        tmp = str(cells[i].get_text())
        p = tmp.find(':')
        out[key] = tmp[p + 1:].replace('\xa0', '')
    return out


def Part01():
    """Parse the fundInfoItem section of the global ``soup``.

    Fills ``home['item']`` with an ``info`` dict (basic attributes) and a
    ``data`` dict (performance figures).  Ordinary funds (01.2) and
    money-market funds (01.3) use different page layouts; whichever parses
    first wins.  On total failure the global ``eros`` counter is bumped.
    """
    #01---infoOfItem---
    global soup
    global home
    global eros
    item = {}
    ks = []
    vs = []
    # BUG FIX: ``flag`` was only assigned on some paths in the original, so
    # the ``if flag == False`` checks below could raise NameError when
    # section 01.1 succeeded but 01.2 failed before setting it.
    flag = False
    try:
        head = soup.find(attrs={'class': 'fundInfoItem'})
    except:
        print('Error 01: at', dataframe.get('code'))
        return
    #01.1-infoOfFund: five labelled <td> cells -> basic fund attributes
    try:
        div = head.find(attrs={'class': 'infoOfFund'})
        table = div.find_all('td')
        info = {}
        for i, key in enumerate(('type', 'scale', 'manager',
                                 'est_date', 'damin')):
            tmp = str(table[i].get_text())
            p = tmp.find(':')
            info[key] = tmp[p + 1:].replace('\xa0', '')
        ks.append('info')
        vs.append(info)
    except:
        print('Error 01.1: at' + dataframe.get('code'))
    #01.2-dataOfFund: layout used by ordinary funds
    try:
        data = _dd_values(head.find(attrs={'class': 'dataOfFund'}),
                          ('estimation', '1month', '1year', 'unit_net',
                           '3month', '3year', 'accum_net', '6month',
                           'since_est'))
        ks.append('data')
        vs.append(data)
        flag = True
    except:
        pass
    #01.3-dataOfFund_hb: fallback layout used by money-market funds
    if not flag:
        try:
            data = _dd_values(head,
                              ('per_million', '7day', '14day', '28day',
                               '1month', '1year', '3month', '3year',
                               '6month', 'since_est'))
            ks.append('data')
            vs.append(data)
            flag = True
        except:
            pass
    if not flag:
        eros += 1
        print('Error 01.2/3: at' + dataframe.get('code'))
    #01------
    for k, v in zip(ks, vs):
        item[k] = v
    home['item'] = item
def Part02():
    """Parse the history-return-rate table of the global ``soup``.

    Stores column-wise lists under ``home['history']``.  Money-market
    funds (02.1) and ordinary funds (02.2) use different page layouts;
    whichever parses first wins.  On total failure the global ``eros``
    counter is bumped.
    """
    #02---historyReturnRate---
    global soup
    global home
    global eros
    history = {}
    flag = False
    #02.1-Monetary: date / per-million return / 7-day annualised columns
    try:
        head = soup.find(attrs={'id': 'historyReturnRate'})
        rows = head.find_all('tr')
        date = []
        per_million = []
        seven_day = []
        for tr in rows[1:]:  # rows[0] is the header row
            # PERF FIX: the original re-ran tr.find_all('td') per column
            tds = tr.find_all('td')
            date.append(tds[0].get_text())
            per_million.append(tds[1].get_text())
            seven_day.append(tds[2].get_text())
        history['date'] = date
        history['per_million'] = per_million
        history['7day'] = seven_day
        home['history'] = history
        flag = True
    except:
        pass
    if flag:
        return
    #02.2-stock: date / unit net / accumulated net / growth rate columns
    try:
        head = soup.find(attrs={'id': 'Div2'})
        rows = head.find('table').find_all('tr')
        date = []
        unit_net = []
        accum_net = []
        rate = []
        for tr in rows[1:]:
            tds = tr.find_all('td')
            date.append(tds[0].get_text())
            unit_net.append(tds[1].get_text())
            accum_net.append(tds[2].get_text())
            rate.append(tds[3].span.get_text())
        history['date'] = date
        history['unit_net'] = unit_net
        history['accum_net'] = accum_net
        history['rate'] = rate
        home['history'] = history
        flag = True
    except:
        pass
    if not flag:
        eros += 1
        print('Error 02: at' + dataframe.get('code'))
    #02------
def Part03():
    """Parse the IncreaseAmount comparison table of the global ``soup``.

    Builds ``home['increase']``: one dict per period holding the fund's
    growth (``inc``), the category average (``avg``), the HS300 index
    figure and the fund's rank.  A missing section means a silent return;
    a malformed table bumps the global ``eros`` counter.
    """
    #03---IncreaseAmount---
    global soup
    global home
    global eros
    try:
        section = soup.find(attrs={'class': 'IncreaseAmount'})
        rows = section.find_all('tr')
    except:
        return
    period, inc, avg, hs300, rank = [], [], [], [], []
    # row index 1..4 -> destination column list (index 0 is the header)
    targets = (None, inc, avg, hs300, rank)
    try:
        for i in range(5):
            row = rows[i]
            if i == 0:
                # header row: period labels live in <th> cells
                for th in row.find_all('th')[1:]:
                    period.append(th.get_text())
            else:
                for td in row.find_all('td')[1:]:
                    targets[i].append(td.get_text())
        increase = []
        for j in range(len(period)):
            increase.append({
                'period': period[j],
                'inc': inc[j],
                'avg': avg[j],
                'hs300': hs300[j],
                'rank': rank[j],
            })
        home['increase'] = increase
    except:
        eros += 1
        print('Error 03 at:' + dataframe.get('code'))
    #03------
def Parse():
    """Parse the current fund's page into the global ``home``.

    Records the fund's identity, then starts the three ``Part0x`` worker
    threads prepared in the global ``partision`` list and waits for all of
    them to finish.
    """
    global home
    global partision
    #00***fundInfoItem***
    home['fund'] = {'name': dataframe.get('name'),
                    'code': dataframe.get('code')}
    for thread in partision:
        # the ``daemon`` attribute replaces the deprecated setDaemon()
        thread.daemon = True
        thread.start()
    for thread in partision:
        thread.join()
def Clean():
    """Split the raw ``type`` and ``scale`` strings inside ``home``.

    ``type``  of the form "a|b"       becomes {'a': a, 'b': b}
    ``scale`` of the form "num(date)" becomes {'num': num, 'date': date}
    """
    info = home.get('item').get('info')
    raw_type = str(info.get('type'))
    sep = raw_type.find('|')
    info['type'] = {'a': raw_type[:sep], 'b': raw_type[sep + 1:]}
    raw_scale = str(info.get('scale'))
    paren = raw_scale.find('(')
    info['scale'] = {'num': raw_scale[:paren],
                     'date': raw_scale[paren + 1:-1]}
# ---- driver: crawl every fund, parse it in three threads, dump as JSON ----
eros = 0  # global error counter, bumped by the Part0x parsers
#col = pymongo.MongoClient('localhost', 27017).Easy2.Easy2
# BUG FIX: the output file was opened without a ``with`` block, so it
# leaked when any iteration raised; the context manager closes it always.
with codecs.open('...', 'w', 'utf-8') as file_w:
    for url in Gen():
        # The Thread objects MUST be re-created on every iteration: a
        # threading.Thread can only be start()ed once, which is why
        # hoisting this list out of the loop failed on the second url.
        partision = [
            threading.Thread(target=Part01),
            threading.Thread(target=Part02),
            threading.Thread(target=Part03),
        ]
        soup = Getsoup(url)
        Parse()
        Clean()
        # col.insert(dict(home))
        # json.dumps already returns str; the redundant str() was dropped
        file_w.write(json.dumps(home, ensure_ascii=False, indent=4) + '\n')
    print('Errors in tot: %d' % eros)