下面的程序是可以正常运行的,已经做过调试,重要的是针对返回结果做筛选和保存
HTTPS POST之后的响应格式不是XML
# coding=utf-8
import httplib
import re
import sys
import time
import urllib

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
reload(sys)
sys.setdefaultencoding("utf-8")
user_agent = 'User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)'
headers = {'User-Agent':user_agent}
#定义一个爬虫
class spider(object):
def __init__(self):
print u'开始爬取内容。。。'
#getsource用来获取网页源代码
def getsource(self,url):
html = requests.get(url)
return html.text
#changepage用来生产不同页数的链接
def changepage(self,url,total_page):
now_page = int(re.search('page=(\d+)',url,re.S).group(1)) #可修改
page_group = []
for i in range(now_page,total_page+1):
link = re.sub('page=\d+','page=%s'%i,url,re.S) #可修改
page_group.append(link)
return page_group
#getpic用来爬取一个网页图片
def getpic(self,source):
selector = etree.HTML(source)
pic_url = selector.xpath('//ul[@class="ali"]/li/div/a/img/@src') #可修改
return pic_url
#savepic用来保存结果到pic文件夹中
def savepic(self,pic_url):
picname=re.findall('(\d+)',link,re.S) #可修改
picnamestr = ''.join(picname)
i=0
for each in pic_url:
print 'now downloading:' + each
pic = requests.get(each)
fp = open('pic\\'+picnamestr +'-'+str(i)+ '.jpg', 'wb')
fp.write(pic.content)
fp.close()
i += 1
#ppic集合类的方法
def ppic(self, link):
print u'正在处理页面:' + link
html = picspider.getsource(link)
pic_url = picspider.getpic(html)
picspider.savepic(pic_url)
def get_info(self,links):
game_name=[]
deposit=[]
period=[]
rebate=[]
for y in links:
try:
r2=requests.get(y,headers=headers)
except:
print ('wrong %s' % main_url)
else:
s2=BeautifulSoup(r2.text,'lxml')
rate=s2.find_all('dd')
rate1=s2.find_all('span')
print "---begin to print rate---"
for i in range(0,len(rate)/5-1):
game_name.append(rate[i*5].string)
deposit.append(rate1[i*3].string)
period.append(rate1[i*3+1].string)
rebate.append(rate1[i*3+2].string)
p={'游戏名称':game_name,
'押金':deposit,
'周期':period,
'返利':rebate}
return p
time1=time.time()
if __name__ == '__main__':
url = 'http://www.wowpower.com/showNewGame?sort=looks&sortseq=down&page=1' #可修改
headers = {"Content-type": "application/x-www-form-urlencoded; charset=UTF-8","Accept": "*/*","origin":"https://agent.qbao.com","referer":"https://agent.qbao.com/agent/home"}
#params = {"orderMode":"0","taskType":"-1","rows":"16","page":"1"}
params = {"orderMode":0,"taskType":-1,"rows":16,"page":1}
data = urllib.urlencode(params)
httpsConn = httplib.HTTPSConnection("www.baidu.com")
httpsConn.request("GET", "/")
res = httpsConn.getresponse()
print res.status, res.reason, res.read()
'''
#host = 'agent.qbao.com'
host = 'www.baidu.com'
#url = '/agent/web/hall'
url = '/'
conn = httplib.HTTPSConnection(host)
print conn
conn.request('POST', url, data, headers)
response = conn.getresponse()
print response.status
print response.reason
print response.read()
print response.getheaders()
'''
#获取头信息
'''
picspider = spider()
all_links = picspider.changepage(url,11) #可修改
print all_links
p=pd.DataFrame(picspider.get_info(all_links))
p.to_csv('e://rate1.csv',index=False,encoding='gbk',
columns=['游戏名称','押金','周期','返利'],
header=['游戏名称','押金','周期','返利'])
for link in all_links:
picspider.ppic(link)
'''
time2=time.time()
print u'耗时:'+str(time2-time1)
因为 HTTPS POST 返回的响应消息是 JSON 格式的,所以查了一下 JSON 的处理方法。
Python的json模块提供了一种很简单的方式来编码和解码JSON数据。 其中两个主要的函数是 json.dumps() 和 json.loads() , 要比其他序列化函数库如pickle的接口少得多。 下面演示如何将一个Python数据结构转换为JSON:
import json

data = {
    'name' : 'ACME',
    'shares' : 100,
    'price' : 542.23
}

json_str = json.dumps(data)