一个爬取股票信息的爬虫程序

最新推荐文章于 2024-04-24 15:45:00 发布

区块链专家

最新推荐文章于 2024-04-24 15:45:00 发布

阅读量937

点赞数

分类专栏： java

原文链接：http://xcjcy.org/

版权

java 专栏收录该内容

52 篇文章 2 订阅

订阅专栏

# -*- coding:utf-8 -*-
2
3 import requests,re,json,time,os
4 import heapq
5 from bs4 import BeautifulSoup
6
class GPINFO(object):
    """Scrape basic financial indicators for stocks, cached in a per-day record file.

    On construction the instance either reloads today's record file (one JSON
    object per line) or crawls the stock list page at ``Url`` plus the detail
    page of every matching stock, appending each kept result to the record file.

    Attributes:
        Url: Address of the stock list page.
        BaseData: List of dicts with the keys ``num``, ``name``,
            ``detail_url`` and ``data`` (the indicator row as a list of strings).
        Date: Today's date formatted as ``YYYYMMDD``.
        Record: Record file name, ``'basedata' + Date``, resolved against the
            current working directory.
    """

    def __init__(self):
        """Load today's record file if it exists, otherwise crawl the data."""
        self.Url = 'http://xcjcy.org/'
        self.BaseData = []
        self.Date = time.strftime('%Y%m%d')
        self.Record = 'basedata' + self.Date
        if os.path.exists(self.Record):
            print('record exist…')
            self.BaseData = self.get_base_data_from_record()
        else:
            print('fuck-get data again…')
            self.get_data()

    def write_record(self, text):
        """Append ``text`` plus a newline to the record file, encoded as UTF-8.

        Args:
            text: One serialized record (a JSON string in this class's usage).
        """
        with open(self.Record, 'ab') as f:
            f.write((text + '\n').encode('utf-8'))

    def get_base_data_from_record(self):
        """Return the records stored in the record file.

        Returns:
            A list with one decoded JSON value per non-blank line of the file.

        Raises:
            ValueError: If a non-blank line is not valid JSON.
        """
        records = []
        with open(self.Record, 'rb') as f:
            for raw_line in f:
                line = raw_line.decode('utf-8').strip()
                # A trailing or stray blank line must not abort the whole load.
                if line:
                    records.append(json.loads(line))
        return records

    def get_data(self):
        """Crawl the list page and each matching detail page into ``BaseData``.

        Every kept stock is appended to ``BaseData`` and written to the record
        file immediately. Stocks whose detail page cannot be fetched or parsed
        are reported on stdout and skipped.
        """
        # Request the stock list page.
        list_html = requests.get(self.Url, timeout=30).content
        soup = BeautifulSoup(list_html, 'lxml')
        # NOTE(review): the target value 'blank' is kept as found; the usual
        # HTML value is '_blank' -- confirm against the actual page markup.
        links = soup.find('div', class_='quotebody').find_all('a', {'target': 'blank'})
        for a in links:
            # Link text is expected to look like "name(code)".
            name, paren, rest = a.get_text().partition('(')
            if not paren:
                # No "(code)" part: this anchor is not a stock entry.
                continue
            num = rest.split('(')[0].strip(')')
            # Only codes beginning with 00 or 60 are collected.
            if not num.startswith(('00', '60')):
                continue
            detail_url = a.get('href')
            if not detail_url:
                continue
            record_d = {'num': num, 'name': name, 'detail_url': detail_url}

            # Fetch the detail page.
            try:
                detail_html = requests.get(detail_url, timeout=30).content
            except requests.RequestException as e:
                print('perhaps timeout:', e)
                continue
            detail_soup = BeautifulSoup(detail_html, 'lxml')

            # Financial indicator row. Per the original author's note the
            # fields are: name, market cap, net assets, net profit, P/E, P/B,
            # gross margin, net margin, ROE (return on equity).
            try:
                cwzb_list = detail_soup.find('div', class_='cwzb').tbody.tr.get_text().split()
            except AttributeError as e:
                # find() or a tag lookup returned None: the table is missing.
                print('error:', e)
                continue
            # Rows containing a bare '-' field are dropped; the original
            # author's comment describes these as delisted stocks.
            if '-' not in cwzb_list:
                record_d['data'] = cwzb_list
                self.BaseData.append(record_d)
                self.write_record(json.dumps(record_d))
                # Running count of collected stocks, printed as progress.
                print(len(self.BaseData))
76
def main():
    """Print the indicator rows of the ten stocks with the largest value at index 7."""
    test = GPINFO()
    result = test.BaseData
    # Row layout per the original author's note:
    # [name, market cap, net assets, net profit, P/E, P/B,
    #  gross margin, net margin, ROE (return on equity)]
    # Index 7 (net margin) is a percentage string such as '12.34%', so the
    # '%' sign is stripped before the numeric comparison.
    top_10 = heapq.nlargest(10, result, key=lambda r: float(r['data'][7].strip('%')))
    for i in top_10:
        print(i['data'])
84
# Run the crawler only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
复制代码

程序主函数部分是为了获取净利率前10名的股票信息,打印结果如下:

复制代码
['绵石投资', '52.2亿', '14.0亿', '1.25亿', '30.90', '3.73', '42.25%', '2047.04%', '9.27%']
['国投安信', '556亿', '270亿', '21.1亿', '19.80', '2.12', '5.90%', '487.53%', '7.79%']
['川投能源', '379亿', '202亿', '28.0亿', '10.16', '1.91', '37.01%', '402.64%', '14.58%']
['ST明科', '47.6亿', '9.25亿', '5.11千万', '68.00', '5.14', '2.38%', '345.11%', '5.68%']
['华联控股', '93.6亿', '31.5亿', '4.76亿', '14.54', '3.74', '46.25%', '328.53%', '20.88%']
['上海九百', '68.2亿', '12.3亿', '1.61亿', '31.67', '5.56', '54.00%', '297.99%', '13.21%']
['凯瑞德', '46.7亿', '1.14亿', '3.27千万', '107.10', '40.94', '16.07%', '294.19%', '33.41%']
['鲁信创投', '172亿', '38.6亿', '3.32亿', '38.48', '4.64', '28.67%', '244.43%', '9.26%']
['博闻科技', '35.0亿', '6.56亿', '2.23千万', '117.65', '5.36', '-16.07%', '215.27%', '3.41%']
['万泽股份', '71.8亿', '13.7亿', '6.87千万', '78.38', '5.29', '22.57%', '203.15%', '5.13%']