Getting Started with Python Spiders

Adding headers and data

import urllib.request

# url is the address of the page to fetch
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
# response is an http.client.HTTPResponse object
print(response.geturl())    # the URL that was fetched
print(response.info())      # the response headers
print(response.getcode())   # the HTTP status code
html = response.read()

The full signature of the Request constructor:

urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
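Everything after url is optional. For instance, the method parameter lets you choose the HTTP verb explicitly (a minimal sketch; the URL here is just an illustration):

req = urllib.request.Request('http://example.com', method='HEAD')
print(urllib.request.urlopen(req).getcode())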

1. The data parameter: the HTTP request will be a POST instead of a GET when the data parameter is provided. data should be a buffer in the standard application/x-www-form-urlencoded format. The urllib.parse.urlencode() function takes a mapping or sequence of 2-tuples and returns a string in this format.

import urllib.parse

# Form fields for a translation request; content holds the text to translate
data = {}
data['type'] = 'AUTO'
data['i'] = content
data['doctype'] = 'json'
data['xmlVersion'] = 1.8
data['keyfrom'] = 'fanyi.web'
data['ue'] = 'UTF-8'
data['action'] = 'FY_BY_CLICKBUTTON'
data['typoResult'] = 'true'
# urlencode() returns a str; urlopen() needs bytes, hence the encode()
data = urllib.parse.urlencode(data).encode('utf-8')
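As a quick check of the format urlencode() produces before the bytes encoding (a standalone sketch):

print(urllib.parse.urlencode({'type': 'AUTO', 'i': 'hello world'}))
# -> type=AUTO&i=hello+world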

2. headers: a dict; the values can be copied straight from your browser's developer tools.

# First way to add a header: build a dict and pass it to Request
header = {}
header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
request = urllib.request.Request(url, data, header)

# Alternatively, add the header to an existing Request object
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')
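Putting data and headers together, a minimal end-to-end sketch of the POST request; the endpoint URL and the content value are illustrative placeholders, not verified:

import json
import urllib.parse
import urllib.request

url = 'http://fanyi.youdao.com/translate'  # illustrative endpoint, not verified
content = 'hello'                          # text to translate

data = urllib.parse.urlencode({'type': 'AUTO', 'i': content, 'doctype': 'json'}).encode('utf-8')
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}

request = urllib.request.Request(url, data, header)
response = urllib.request.urlopen(request)
result = json.loads(response.read().decode('utf-8'))
print(result)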

Getting the Response Headers

The code below checks whether the page was served gzip-compressed and, if so, decompresses it.

import gzip

# getheaders() returns a list of (header-name, value) tuples
for name, value in response.getheaders():
    if name == "Content-Encoding" and value == "gzip":
        html = gzip.decompress(html)
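Equivalently, HTTPResponse offers a getheader() lookup, so the same check shrinks to a one-liner (a small sketch reusing the html bytes from above):

if response.getheader('Content-Encoding') == 'gzip':
    html = gzip.decompress(html)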

Using a proxy

1. The argument is a dict of the form {'scheme': 'proxy-ip:port'}:

proxy_support = urllib.request.ProxyHandler({})

2. Build a custom opener from the handler:

opener = urllib.request.build_opener(proxy_support)

3a. Install the opener globally: urllib.request.install_opener(opener)
3b. Or call the opener directly: opener.open(url)

Code

import random
import urllib.request

# iplist is a list of 'ip:port' proxy strings
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')]
urllib.request.install_opener(opener)

req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
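If you would rather not replace the global opener (option 3b above), call the opener directly; only requests made through it go via the proxy:

opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')]
response = opener.open(url)
html = response.read()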

Scraping images from Zhihu

import urllib.request
import os
import random

# Open a page through a randomly chosen proxy and return the raw bytes
def url_open(url):
    iplist = [
        '49.77.22.1:8118',
        '58.134.102.3:12696',
        '120.26.213.55:9999',
        # ...
    ]
    proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')]
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

# Scan the HTML for image addresses and return them as a list
def get_imgs(url):
    html = url_open(url).decode('utf-8')
    img_address = []
    a = html.find('data-original')
    while a != -1:
        b = html.find('.jpg', a, a + 300)
        if b != -1:
            # skip past 'data-original="' (15 chars) to the start of the URL
            img_address.append(html[a+15:b+4])
        else:
            b = a + 9
        a = html.find('data-original=', b)
    for i in img_address:
        print(i)
    return img_address
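Scanning with find() is brittle; an alternative sketch using a regular expression (get_imgs_re is a hypothetical helper, and the pattern assumes the URLs sit in data-original="..." attributes ending in .jpg, as above):

import re

def get_imgs_re(html):
    # capture the URL inside data-original="...jpg" attributes
    return re.findall(r'data-original="([^"]+?\.jpg)"', html)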

# Download each image and save it to the current directory
def save_imgs(img_address):
    for i in img_address:
        # use the last path segment as the local file name
        filename = i.split('/')[-1]
        with open(filename, 'wb') as f:
            img = url_open(i)
            f.write(img)

def zhihuPic(url, folder="zhihu"):
    # create the target folder if needed, then work inside it
    if os.path.exists(folder):
        os.chdir(folder)
    else:
        os.mkdir(folder)
        os.chdir(folder)
    img_address = get_imgs(url)
    save_imgs(img_address)

if __name__ == '__main__':
    zhihuPic("https://www.zhihu.com/question/22070147")
