Python crawler sometimes fails when scraping data. Could anyone explain why the data is sometimes scraped successfully and sometimes not? Code below:

import time
import requests
from lxml import etree
import pymysql


class GuPiao_spider():

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0'
        }
        # URL template for each list page
        self.url = 'http://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/{}/ajax/1/'
        # self.content_url = 'http://qd.10jqka.com.cn/quote.php?cate=real&type=stock&return=json&callback=showStockData&code='  # URL of the dynamic quote page

    def get_pages(self):
        '''
        Get the site's total page count.
        :return: total number of pages as a string
        '''
        url = 'http://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/1/ajax/1/'
        html = requests.get(url, headers=self.headers).content.decode('gbk')
        html_path = etree.HTML(html)
        page_num = html_path.xpath('//*[@id="m-page"]/span/text()')[0].split('/')[1]
        return page_num

    def get_content(self, url):
        '''
        Fetch one page's source and extract the data.
        :param url: URL of the page to scrape
        :return: list of dicts with the required fields
        '''
        html = requests.get(url, headers=self.headers).content.decode('gbk')
        html_path = etree.HTML(html)
        tr = html_path.xpath('//tbody/tr')
        content_list = []
        for td in tr:
            info = {}
            title = td.xpath('td[3]/a/text()')[0]  # stock name
            code = td.xpath('td[2]/a/text()')[0]   # stock code
            li = td.xpath('td/text()')             # remaining plain-text cells
            info['序号'] = li[0]
            info['代码'] = code
            info['名称'] = title
            info['现价'] = li[1]
            info['涨跌幅'] = li[2]
            info['涨跌'] = li[3]
            info['涨速'] = li[4]
            info['换手'] = li[5]
            info['量比'] = li[6]
            info['振幅'] = li[7]
            info['成交额'] = li[8]
            info['流通股'] = li[9]
            info['流动市值'] = li[10]
            info['市盈率'] = li[11]  # key must match the %(市盈率)s placeholder in the SQL
            content_list.append(info)
        return content_list

    # def save_mysql(self, sql, data):
    #     '''
    #     Connect to MySQL and store the data in the database.
    #     :return:
    #     '''
    #     conn = pymysql.connect(
    #         host='localhost',
    #         user='root',
    #         password='123456',
    #         port=3306,
    #         db='test'
    #     )
    #     cur = conn.cursor()
    #     cur.execute(sql, data)
    #     conn.commit()

    def run(self):
        sql = 'insert into tonghuashun(序号,代码,名称,现价,涨跌幅,涨跌,涨速,换手,量比,振幅,' \
              '成交额,流通股,流动市值,市盈率) values (%(序号)s,%(代码)s,%(名称)s,%(现价)s,' \
              '%(涨跌幅)s,%(涨跌)s,%(涨速)s,%(换手)s,' \
              '%(量比)s,%(振幅)s,%(成交额)s,%(流通股)s,' \
              '%(流动市值)s,%(市盈率)s)'
        page_num = int(self.get_pages())  # total number of pages on the site
        count = 1  # page counter, substituted into self.url (start at 1 so page 1 is not skipped)
        while count <= page_num:  # <= so the last page is also scraped
            print('Scraping page {} ...'.format(count))
            url = self.url.format(count)
            # print(url)
            content_list = self.get_content(url)
            for data in content_list:  # iterate over the list to get each dict
                print(data)
                # self.save_mysql(sql, data)
            count += 1
            time.sleep(3)  # throttle requests

if __name__ == '__main__':
    spider = GuPiao_spider()
    spider.run()
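
As for why the scrape only sometimes succeeds: a likely cause, assuming q.10jqka.com.cn's anti-crawler behavior, is that the server intermittently returns a block page instead of the stock table, so the //tbody/tr xpath comes back empty and the li[...] indexing in get_content raises IndexError. A minimal sketch of a defensive fetch under that assumption; fetch_rows is a hypothetical helper, not part of the original class:

import time
import requests
from lxml import etree

def fetch_rows(url, headers, retries=3, delay=5):
    '''Fetch one list page and retry while the expected table is missing.'''
    for attempt in range(retries):
        resp = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(resp.content.decode('gbk', errors='replace'))
        rows = html.xpath('//tbody/tr')
        if rows:  # the stock table is present, so the page is usable
            return rows
        time.sleep(delay)  # blocked or empty response: wait and try again
    raise RuntimeError('no table rows after {} attempts: {}'.format(retries, url))

get_content could call this instead of requests.get directly, so a blocked response turns into a retry rather than an IndexError.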

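If the commented-out MySQL save is re-enabled, here is a sketch with an explicit charset and connection cleanup, assuming the same local database test and table tonghuashun from the original code:

import pymysql

def save_mysql(sql, data):
    '''Insert one row; data is a single dict from content_list.'''
    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           port=3306, db='test', charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            cur.execute(sql, data)  # named %(...)s placeholders map to dict keys
        conn.commit()
    finally:
        conn.close()  # always release the connection, even on failure

Opening one connection per row (as the original did) works but is slow; moving the connect/close out to run() and reusing one connection would be the usual choice.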