import time
import requests
from lxml import etree
import pymysql
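
# Scraper for the 10jqka (同花顺) stock board: pages through the AJAX listing
# at q.10jqka.com.cn, parses each table row with lxml XPath, and can
# optionally persist the rows to MySQL via pymysql (see save_mysql below).
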
class GuPiao_spider():
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0'
        }
        self.url = 'http://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/{}/ajax/1/'  # URL template for each page of the board
        # self.content_url = 'http://qd.10jqka.com.cn/quote.php?cate=real&type=stock&return=json&callback=showStockData&code='  # dynamic (JSONP) quote endpoint
    def get_pages(self):
        '''
        Get the total number of pages on the site.
        :return: the page count as a string
        '''
        url = 'http://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/1/ajax/1/'
        html = requests.get(url, headers=self.headers).content.decode('gbk')  # the site serves GBK-encoded pages
        html_path = etree.HTML(html)
        # the pager text looks like "1/xx"; the part after the slash is the total page count
        page_num = html_path.xpath('//*[@id="m-page"]/span/text()')[0].split('/')[1]
        return page_num
    def get_content(self, url):
        '''
        Fetch one page's HTML and extract the stock data from it.
        :param url: URL of the page to scrape
        :return: list of dicts, one per stock row
        '''
        html = requests.get(url, headers=self.headers).content.decode('gbk')
        html_path = etree.HTML(html)
        tr = html_path.xpath('//tbody/tr')
        content_list = []
        for td in tr:
            info = {}
            title = td.xpath('td[3]/a/text()')[0]  # stock name
            code = td.xpath('td[2]/a/text()')[0]   # stock code
            li = td.xpath('td/text()')             # remaining plain-text cells
            info['序号'] = li[0]
            info['代码'] = code
            info['名称'] = title
            info['现价'] = li[1]
            info['涨跌幅'] = li[2]
            info['涨跌'] = li[3]
            info['涨速'] = li[4]
            info['换手'] = li[5]
            info['量比'] = li[6]
            info['振幅'] = li[7]
            info['成交额'] = li[8]
            info['流通股'] = li[9]
            info['流动市值'] = li[10]
            info['市盈率'] = li[11]
            content_list.append(info)
        return content_list
    # def save_mysql(self, sql, data):
    #     '''
    #     Connect to the MySQL database and store one row of data.
    #     :return:
    #     '''
    #     conn = pymysql.connect(
    #         host='localhost',
    #         user='root',
    #         password='123456',
    #         port=3306,
    #         db='test',
    #         charset='utf8mb4'  # needed for the Chinese column names
    #     )
    #     cur = conn.cursor()
    #     cur.execute(sql, data)
    #     conn.commit()
    #     cur.close()
    #     conn.close()
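
    # A minimal sketch of the table the INSERT in run() assumes. The actual
    # schema is not included in the source, so the column types here are a
    # guess (VARCHAR throughout, since the scraped values stay strings):
    #
    #   CREATE TABLE tonghuashun (
    #       序号 VARCHAR(16), 代码 VARCHAR(16), 名称 VARCHAR(32),
    #       现价 VARCHAR(16), 涨跌幅 VARCHAR(16), 涨跌 VARCHAR(16),
    #       涨速 VARCHAR(16), 换手 VARCHAR(16), 量比 VARCHAR(16),
    #       振幅 VARCHAR(16), 成交额 VARCHAR(16), 流通股 VARCHAR(16),
    #       流动市值 VARCHAR(16), 市盈率 VARCHAR(16)
    #   ) DEFAULT CHARSET=utf8mb4;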
    def run(self):
        # column names and named placeholders match the dict keys built in get_content()
        sql = 'insert into tonghuashun(序号,代码,名称,现价,涨跌幅,涨跌,涨速,换手,量比,振幅,成交额,流通股,流动市值,市盈率) ' \
              'values (%(序号)s,%(代码)s,%(名称)s,%(现价)s,' \
              '%(涨跌幅)s,%(涨跌)s,%(涨速)s,%(换手)s,' \
              '%(量比)s,%(振幅)s,%(成交额)s,%(流通股)s,' \
              '%(流动市值)s,%(市盈率)s)'
        page_num = self.get_pages()  # total number of pages on the site
        count = 1  # page counter, substituted into self.url (starting from 1 so no page is skipped)
        while True:
            print('Scraping page {} ...............'.format(count))
            url = self.url.format(str(count))
            # print(url)
            content_list = self.get_content(url)
            for data in content_list:  # loop over the list to get each row as a dict
                print(data)
                # self.save_mysql(sql, data)
            count += 1
            time.sleep(3)  # throttle requests between pages
            if count > int(page_num):  # stop after the last page has been scraped
                return
if __name__ == '__main__':
    spider = GuPiao_spider()
    spider.run()