记录一次爬虫经历

初学python,先记录一次爬虫经历,就作为python的入门训练吧。目标网站采用了动态加载技术。

#-*- coding:utf-8 -*-
import requests
import re
import threading
# Shared crawler configuration.
# NOTE: the original used a `global` statement here, which is a no-op at
# module level — plain assignments are sufficient.
offset_for_pc = 0  # starting paging offset for the file-list API

# Pickcodes that must be skipped (redacted in the original post).
forbidden = ["xxxxxxx",
             "xxxxxxx",
             "xxxxxx",
             "xxxxxxx"]

# Headers for the file-list ("pickcode") endpoint.
# BUG FIX: the original dict was missing the comma after the 'Cookie'
# entry, so Python concatenated 'xxxxx' with 'Host' into a single string
# and the dict literal became a syntax error. Same fix applied below.
headers_for_pc = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'xxxxx',
    'Host': 'aps.115.com',
    'Referer': 'http://aps.115.com/bridge_2.0.html?xxxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

# Headers for the real-download-url endpoint.
headers_for_realurl = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'xxxxx',
    'Host': 'web.api.115.com',
    'Referer': 'http://web.api.115.com/bridge_2.0.html?xxxxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

# Endpoints (redacted query strings kept as in the original).
url_for_pc = "http://aps.115.com/natsort/files.php?xxxxxx"
url_for_realurl = "http://web.api.115.com/xxxxxx"
def getpc(url, offset):
    """Fetch one page of the file listing and extract the "pc" (pickcode) values.

    url    -- the file-list endpoint (aps.115.com).
    offset -- paging offset, sent as the `offset` query parameter.

    Returns a list of pickcode strings on HTTP 200, or -1 on any other
    status (callers test for -1 explicitly).
    """
    # A params dict is equivalent to the original "offset=%s" string but
    # lets requests handle the encoding.
    response = requests.get(url, params={"offset": offset}, headers=headers_for_pc)
    if response.status_code == 200:
        html = response.text
        return re.findall(r'"pc":"(.*?)"', html)
    # BUG FIX: the original error path referenced the misspelled name
    # `reponse`, raising a NameError instead of reporting the HTTP status.
    print("Sorry, get pickcodes failed, error code: %s" % response.status_code)
    return -1
def geturl(url, pickcode):
    """Resolve a pickcode to its real download URL.

    url      -- the real-url endpoint (web.api.115.com).
    pickcode -- a "pc" value extracted by getpc().

    Returns the list of "file_url" matches found in the response body on
    HTTP 200 (may be empty), or -1 on any other status.
    """
    # A params dict is equivalent to the original "pickcode=" + pickcode
    # string but lets requests handle the encoding.
    response = requests.get(url, params={"pickcode": pickcode}, headers=headers_for_realurl)
    if response.status_code == 200:
        html = response.text
        return re.findall(r'"file_url":"(.*?)"', html)
    # Typo fix: original message read "Sory ... Errorcode" with no separator.
    print("Sorry, get real url failed, error code: %s" % response.status_code)
    return -1
def getpic(url, name):
    """Download `url` and write the raw bytes to a file named `name`."""
    # `with` guarantees the file is closed even if the download raises,
    # unlike the original open/write/close sequence.  ("%s" % name was a
    # no-op for string names, so plain `name` is equivalent.)
    with open(name, "wb") as f:
        f.write(requests.get(url).content)
def work(offset):
    """Download every non-forbidden picture on one page of the listing.

    offset -- paging offset (int or str); forwarded to getpc().
    """
    offset = "%s" % (offset,)
    print(offset)
    pcs = getpc(url_for_pc, offset)
    if pcs == -1:
        return  # listing request failed; getpc already reported it
    for pc in pcs:
        if pc in forbidden:
            continue
        urls = geturl(url_for_realurl, pc)
        # BUG FIX: the original indexed urls[0] unconditionally, but geturl
        # can return -1 (HTTP error) or an empty list (no "file_url" in the
        # response body), either of which crashed the worker.
        if urls != -1 and urls:
            # The JSON body escapes '/' as '\/'; strip the backslashes.
            getpic(str(urls[0]).replace("\\", ""), pc)

# BUG FIX: the original passed args=(i) — just the int, not a 1-tuple —
# which makes Thread raise a TypeError when invoking the target, and it
# called td.run(), which executes work() synchronously in the calling
# thread instead of starting a new one. Use args=(i,) and start(), then
# join every worker so "done" prints only after all downloads finish.
threads = []
for i in range(0, 197, 24):
    td = threading.Thread(target=work, args=(i,))
    td.start()
    threads.append(td)
for td in threads:
    td.join()
print("done")


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值