接上一篇文章用 Qt 制作了爬票的 GUI 程序后,接下来实现抓取相关数据并展示在 GUI 程序中。在完成该功能前,先介绍一下 Python 爬虫经常用到的知识点。
import urllib.parse #urllib库为python3自带的库,无须安装
import urllib.request
import urllib3 #需要python3中使用pip install urllib3 进行安装
import requests
from requests.exceptions import ReadTimeout,HTTPError,RequestException
from bs4 import BeautifulSoup #提取html内容的库
# Example usage of urllib: POST url-encoded form data and print the reply.
encoded = urllib.parse.urlencode({'word': 'hello'})
data = bytes(encoded, encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read().decode('utf8'))
# Example usage of urllib3: issue a GET request through a PoolManager.
http = urllib3.PoolManager()
url = 'http://www.baidu.com'
response = http.request('GET', url)
print(response.data.decode('utf8'))
# Example of requests.get carrying a User-Agent header and a client cookie
# (sites often block bare requests; a browser-like header plus a cookie
# captured from the browser gets around that).
request_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
request_cookies = {
    'STOKEN_BFESS': '56b3ea70e2d3a03ac3fe0b80145ec3c09d86b69e56d6dbdb1d0501377926de53'}
response = requests.get(url, headers=request_headers, cookies=request_cookies)
# A plain GET, then dump the interesting attributes of the response.
resp = requests.get('http://www.baidu.com')
for value in (resp.status_code, resp.url, resp.headers,
              resp.cookies, resp.text, resp.content.decode('utf8')):
    print(value)
# Example of requests.post: submit form data and dump the response fields.
form_data = {
    'username': 'hello'}
resp = requests.post('http://httpbin.org/post', data=form_data)
for value in (resp.status_code, resp.url, resp.headers,
              resp.cookies, resp.text, resp.content.decode('utf8')):
    print(value)
# A minimal BeautifulSoup example: fetch a page, parse it with lxml,
# pretty-print the tree, then pull out the <title> text.
news_page = requests.get('http://news.baidu.com')
soup = BeautifulSoup(news_page.text, features='lxml')
print(soup.prettify())
print(soup.find('title').text)
上面的示例代码主要介绍了 Python 爬虫常用到的一些模块,其中 requests 用法示例中带有 header 和 cookie 相关的使用介绍。因为在实际项目中爬取数据经常会被网站屏蔽,所以带上 header 头和客户端 cookie 可以解决这一问题;cookie 的获取方式可以在浏览器开发者工具中查看。
上一篇文章介绍了使用Qt Designer工具制作爬票gui,运行示例如下图
定义一组函数获取 12306 所有车站名称,文件命名为 get_stations.py,并把获取到的车站名称保存到文件中。
import re
import requests
import os
def getStation():
    """Fetch the 12306 station-name table and persist it to disk.

    Downloads the station_name.js resource, extracts (Chinese name,
    telecode) pairs with a regex, and writes the resulting mapping to
    'stations.text' via write() so the GUI can translate a typed
    station name into the query parameter offline.
    """
    # Endpoint that serves the full station list as one JavaScript string.
    url = 'https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9050'
    response = requests.get(url, verify=True)  # verify the TLS certificate
    # Each station appears as ...|汉字名|TELECODE|...; capture name + code.
    # Raw string avoids the invalid-escape warning of the original pattern.
    stations = re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', response.text)
    # BUG FIX: dict() accepts no 'indent' keyword — the original line
    # `dict((stations), indent=4)` raised TypeError at runtime.
    stations = dict(stations)
    # Serialize with str() so read() can eval() it back into a dict.
    write(str(stations))
def write(stations):
    """Write the serialized station mapping to 'stations.text'.

    stations: str — the station dict rendered with str(); read() later
    eval()s this text back into a dict.
    """
    # Context manager guarantees the handle is closed even if write()
    # raises; utf_8_sig prepends a BOM so Windows editors open it cleanly.
    with open('stations.text', 'w', encoding='utf_8_sig') as file:
        file.write(stations)
def read():
    """Return the first line of 'stations.text' (the serialized dict).

    write() stores the whole mapping as a single str(dict) line, so one
    readline() recovers all of it.
    """
    # Open for READING (the original comment wrongly said write mode);
    # the context manager closes the file on every exit path.
    with open('stations.text', 'r', encoding='utf_8_sig') as file:
        return file.readline()
def isStations():
    """Return True if the cached station file 'stations.text' exists."""
    # Return directly — the original bound the result to a local that
    # shadowed the function's own name.
    return os.path.exists('stations.text')
接下来在 query_requests.py 文件中定义函数获取站点相关车次信息,并加载 get_stations.py 中的方法;查看 12306 站点获取车次的相关接口,得到结果后分析并组装相关数据。
import requests
from get_stations import *
data = [] # module-level buffer holding the assembled train-run rows for the current query
type_data = [] # holds the final rows after the trains have been classified by type
def query(date, from_station, to_station):
data.clear() # 清空数据
type_data.clear() # 清空车次分类保存的数据
# 查询请求地址
url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date={}&leftTicketDTO.from_station={}&leftTicketDTO.to_station={}&purpose_codes=ADULT'.format(
date, from_station, to_station)
# 发送查询请求
heaed = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
cookies = {
'_uab_collina': '160557808500061392861085', 'JSESSIONID': '4529B254DD1DA81A87A8C51E37B0E226',
'RAIL_DEVICEID': 'QJecep2XuCpZ9e7kdVkYpbWmhpiRe6Czk1alTKJSUyNu9KNK8v9USaqbZOK4Nx4uRH8TLzrLzuvwjirgmdYXhYurvlG4m0UWJPEP8Wg2BHOfJ3BUj6UlHxUs01_8cwsmJWIGBKw7k_7c_yqerJ7fq4JOFQUfoiPu'}
response = requests.get(url, headers=heaed, cookies=cookies)
# # 将json数据转换为字典类型,通过键值对取数据
result = response.json()
result = result['data']['result']
if isStations() == True:
stations = eval(read())