This post records an exercise from an imooc (慕课网) course. The course is fairly old and the Baidu stock page it used is no longer available, so, following [1], the stock site gucheng.com (股城网) is used instead.
Method 1: using the requests library
The idea: extract the individual stock codes from the start page, then fetch each stock's trading information; for convenience, the scraped data is stored directly in a CSV file. The code is as follows:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0'
}

def get_html(urls):
    """Fetch the start page."""
    try:
        html = requests.request('GET', urls, headers=headers)
        html.encoding = html.apparent_encoding
        return html
    except requests.RequestException:
        # return None so callers can tell that the request failed
        return None

def get_urls(response):
    """Extract the stock codes from the start page."""
    soup = BeautifulSoup(response.text, 'html.parser')
    l = []
    for i in soup.find_all('a'):
        try:
            href = i.attrs['href']
            # stock codes embedded in the links look like SZ000001 / SH600000
            ls_name = re.findall(r"S[ZH]\d{6}", href)[0]
            l.append(ls_name)
        except (KeyError, IndexError):
            # <a> tag without an href, or an href without a stock code
            continue
    print(l)
    return l
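To see what the filter keeps, here is the pattern applied to two illustrative hrefs (the URLs are made up for the example; only the SZ/SH-plus-six-digits shape matters):

import re
hrefs = ['https://hq.gucheng.com/SZ000001/', 'https://hq.gucheng.com/gpdmylb.html']
print([re.findall(r"S[ZH]\d{6}", h) for h in hrefs])
# [['SZ000001'], []] -- only links that embed a stock code survive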
def get_information(l):
    """Fetch every stock's quote page and collect the fields into a DataFrame."""
    tf = []
    for s in l:
        print(s)
        l_dict = {}
        url = "https://hq.gucheng.com/" + s + '/'
        html = requests.request('GET', url, headers=headers)
        if html.status_code == 200:
            soup = BeautifulSoup(html.text, 'html.parser')
            name = soup.find('header', class_='stock_title').text.split('\n')[1]
            # highest / lowest prices
            price = soup.find('dl', class_='s_height').text.split('\n')
            # the remaining quote fields
            res = soup.find('div', class_='s_date').text.split('\n')
            print(res)
            # collect everything into one dict per stock
            # ('名称' = name, '代码' = code; the keys stay in Chinese to match
            # the field names scraped from the page)
            l_dict['名称'] = name
            l_dict['代码'] = s
            l_dict[price[1]] = price[2]
            l_dict[price[3]] = price[4]
            # res alternates field name / field value, with blank lines mixed in
            i = 0
            while i < len(res) - 1:
                if res[i] != '':
                    l_dict[res[i]] = res[i + 1]
                    i += 2
                else:
                    i += 1
            print(l_dict)
            tf.append(l_dict)
    # build the table once all stocks have been fetched
    data = pd.DataFrame(tf, columns=tf[0].keys())
    # data.to_csv('股票交易信息.csv', index=None, encoding='utf-8')
    # (use encoding='utf_8_sig' instead if the CSV will be opened in Excel)
    print(data)
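Note that the three functions above are only defined, never called. A minimal driver, assuming the same list page https://hq.gucheng.com/gpdmylb.html that method 2 below starts from, might look like this:

if __name__ == '__main__':
    start_url = 'https://hq.gucheng.com/gpdmylb.html'
    response = get_html(start_url)
    if response is not None:  # get_html returns None on network errors
        codes = get_urls(response)
        get_information(codes[:50])  # cap the count so the demo finishes quickly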
Method 2: crawling with the scrapy library directly
This approach came about because, when crawling inside the scrapy framework, the intermediate steps cannot easily be printed, which made the code hard to write and debug. Wrapping a plain requests response in scrapy's TextResponse makes the same selectors debuggable in an ordinary script.
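(As an aside, scrapy itself ships an interactive shell for exactly this kind of selector debugging, e.g. scrapy shell 'https://hq.gucheng.com/gpdmylb.html'; the requests + TextResponse combination below does the same job from inside an ordinary script.)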
import requests
from scrapy.http import TextResponse
import re

headers = {'user-agent': 'Mozilla/5.0'}
start_url = 'https://hq.gucheng.com/gpdmylb.html'
r = requests.request('GET', start_url, headers=headers)
# wrapping the raw body in a TextResponse is what makes the .css() selectors usable
r_html = TextResponse(body=r.content, url=start_url)
rm = r_html.css('.stock_sub')
# only the first 50 stocks, to keep the run short
urls = rm.css('a::attr(href)').extract()[:50]
for url in urls:
    infodict = {}
    html = requests.request('GET', url, headers=headers)
    resp = TextResponse(body=html.content, url=url)
    stockinfo = resp.css('.stock_price.clearfix')
    try:
        name = stockinfo.css('h3').extract()[0]
        # the text between the tags, with the last four characters trimmed off
        print(re.findall(r'>(.*)<', name)[0][:-4])
        # <dt> holds the field names, <dd> the values; the last four entries are dropped
        keylist = stockinfo.css('dt').extract()[:-4]
        value = stockinfo.css('dd').extract()[:-4]
        for t in range(len(keylist)):
            key = re.findall(r'>(.*)<', keylist[t])[0]
            val = re.findall(r'>(.*)<', value[t])[0]
            infodict[key] = val
        # '股票名称' = stock name
        infodict['股票名称'] = re.findall(r'>(.*)<', name)[0][:-4]
    except IndexError:
        # page without the expected structure
        continue
    print(infodict)
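Once the selectors behave as expected in the script above, they can be moved back into a real scrapy spider. A minimal sketch, wired up the same way (the class name and spider name are illustrative, not from the course):

import re
import scrapy

class GuchengSpider(scrapy.Spider):
    name = 'gucheng'
    start_urls = ['https://hq.gucheng.com/gpdmylb.html']

    def parse(self, response):
        # same selector as above: follow the first 50 stock links
        for href in response.css('.stock_sub a::attr(href)').extract()[:50]:
            yield response.follow(href, callback=self.parse_stock)

    def parse_stock(self, response):
        stockinfo = response.css('.stock_price.clearfix')
        infodict = {}
        for dt, dd in zip(stockinfo.css('dt').extract()[:-4],
                          stockinfo.css('dd').extract()[:-4]):
            try:
                infodict[re.findall(r'>(.*)<', dt)[0]] = re.findall(r'>(.*)<', dd)[0]
            except IndexError:
                continue
        yield infodict

It can be run without a project scaffold, e.g. scrapy runspider gucheng_spider.py -o quotes.csv.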