小甲鱼python视频教程053~055论一只爬虫的自我修养

论一只爬虫的自我修养

Python如何访问互联网

URL+lib》》》urllib

URL的一般格式为(带方括号[]的为可选项)

Protocol://hostname[:port]/path/[;parameters][?query]#fragment

 

URL由三部分组成:

---第一部分是协议:http,https,ftp,file,ed2k…

---第二部分是存放资源的服务器的域名系统或IP地址(有时候要包含端口号,各种传输协议都有默认的端口号,如http的默认端口为80)。

---第三部分是资源的具体地址,如目录或文件名等。

>>> import urllib.request

>>> response =urllib.request.urlopen("http://www.fishc.com")

>>> html = response.read()

>>> print(html)

 解码操作:

>>> html =html.decode("utf-8")

>>> print(html)

 

实战

http://placekitten.com/

http://placekitten.com/g/500/600

 

import urllib.request

# Download a 500x600 placeholder picture. The argument is the address of the
# image itself; a direct link such as
# http://photocdn.sohu.com/20160928/Img469362391.jpg is an image address too.
response = urllib.request.urlopen('http://placekitten.com/g/500/600')

# Equivalent two-step form: build a Request object first, then open it.
# req = urllib.request.Request("http://placekitten.com/g/500/600")
# response = urllib.request.urlopen(req)

# read() returns the response body as bytes.
cat_img = response.read()

# The body is bytes, so the file is opened in binary mode ('wb').
with open('cat_500_600.jpg', 'wb') as f:
    f.write(cat_img)

 

import urllib.request

# Download the picture through an explicit Request object instead of passing
# the URL string straight to urlopen().
req = urllib.request.Request('http://placekitten.com/g/500/600')
response = urllib.request.urlopen(req)

# read() returns the response body as bytes.
cat_img = response.read()

# The body is bytes, so the file is opened in binary mode ('wb').
with open('cat_500_600_700.jpg', 'wb') as f:
    f.write(cat_img)

 

>>>response.geturl()

'http://placekitten.com/g/500/600'

>>>response.info()

<http.client.HTTPMessage object at 0x032903B0>

>>>print(response.info())

Date: Fri, 23 Sep 2016 12:08:01 GMT

Content-Type:image/jpeg

Content-Length:26590

Connection:close

Set-Cookie:__cfduid=d597b1c0153223d2cc087dc8f0047e92e1474632481; expires=Sat, 23-Sep-17 12:08:01 GMT; path=/; domain=.placekitten.com; HttpOnly

Accept-Ranges:bytes

X-Powered-By:PleskLin

Access-Control-Allow-Origin:*

Cache-Control:public

Expires: Thu, 31 Dec 2020 20:00:00 GMT

Server:cloudflare-nginx

CF-RAY:2e6de12fe1b822b2-LAX

 

 

>>>response.getcode()

200

>>> 

 

爬虫有道词典

import urllib.request
import urllib.parse
import json

# Text to translate, read from the console.
content = input("输入需要翻译的内容:")

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

# Form fields sent in the request body.
# The form's 'action' field (FY_BY_CLICKBUTTON) is not sent here.
data = {
    'type': 'AUTO',
    'i': content,
    'doctype': 'json',
    'xmlVersion': '1.8',
    'keyfrom': 'fanyi.web',
    'ue': 'UTF-8',
    'typoResult': 'true',
}
# urlopen() needs the body as bytes: URL-encode the fields, then encode them.
data = urllib.parse.urlencode(data).encode('utf-8')

# Supplying a data argument makes urlopen() send a POST request.
response = urllib.request.urlopen(url, data)
html = response.read().decode('utf-8')

# The reply is JSON text; the translated string is nested under
# 'translateResult' -> first list -> first item -> 'tgt'.
result = json.loads(html)
target = result['translateResult'][0][0]['tgt']
print(target)

 

隐藏

import urllib.request
import urllib.parse
import json

# Text to translate, read from the console.
content = input("输入需要翻译的内容:")

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

# Alternative way to set the header: collect it in a dict and pass the dict
# as the third argument, i.e. urllib.request.Request(url, data, head):
# head = {}
# head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'

# Form fields sent in the request body.
# The form's 'action' field (FY_BY_CLICKBUTTON) is not sent here.
data = {
    'type': 'AUTO',
    'i': content,
    'doctype': 'json',
    'xmlVersion': '1.8',
    'keyfrom': 'fanyi.web',
    'ue': 'UTF-8',
    'typoResult': 'true',
}
# The request body must be bytes: URL-encode the fields, then encode them.
data = urllib.parse.urlencode(data).encode('utf-8')

# Build the request explicitly so a browser-like User-Agent can be attached.
req = urllib.request.Request(url, data)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')

# Open the Request object (not the bare url), otherwise the header added
# above is never sent.
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')

# The reply is JSON text; the translated string is nested under
# 'translateResult' -> first list -> first item -> 'tgt'.
result = json.loads(html)
target = result['translateResult'][0][0]['tgt']
print(target)

 

修改header

通过request的headers参数修改

通过request.add_header()方法修改

 

import urllib.request
import urllib.parse
import json
import time


# Request settings that never change between iterations are built once here
# instead of on every pass through the loop.
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"
user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36"

# Keep translating console input until the user types the quit token 'q!'.
while True:
    content = input("请输入需要翻译的内容(输入q!退出程序。):")
    if content == 'q!':
        break

    # Form fields for this query; the body must be bytes, so the fields are
    # URL-encoded and then encoded as UTF-8.
    data = urllib.parse.urlencode({
        'type': 'AUTO',
        'i': content,
        'doctype': 'json',
        'xmlVersion': '1.8',
        'keyfrom': 'fanyi.web',
        'ue': 'UTF-8',
        'action': 'FY_BY_CLICKBUTTON',
        'typoResult': 'true',
    }).encode('utf-8')

    # Alternative: put the header in a dict (head = {'User-Agent': ...}) and
    # pass it as the third argument: urllib.request.Request(url, data, head).
    req = urllib.request.Request(url, data)
    req.add_header('User-Agent', user_agent)

    # The with-block closes the HTTP response before the next iteration, so
    # connections do not pile up while the loop keeps running.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')

    target = json.loads(html)
    print("翻译结果:%s" % (target['translateResult'][0][0]['tgt']))

    # Also show the complete decoded reply.
    print(target)
    # Wait 5 seconds before the next query (presumably to avoid hitting the
    # service too frequently).
    time.sleep(5)

 

代理

步骤:

1.      参数是一个字典 {'类型': '代理IP:端口号'}

   proxy_support = urllib.request.ProxyHandler({})

2.      定制、创建一个opener

opener = urllib.request.build_opener(proxy_support)

3a.安装opener

urllib.request.install_opener(opener)

     3b.调用opener

           opener.open(url)

import urllib.request

# Page that is fetched through the proxy; its body is printed below.
url = 'http://www.whatismyip.com.tw'

# Step 1: the handler takes a dict of {'scheme': 'proxy_ip:port'}.
proxy_support = urllib.request.ProxyHandler({'http': '58.67.159.50:80'})

# Step 2: build an opener around the handler and give it a browser-like
# User-Agent header.
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')]

# Step 3a: install the opener so that plain urlopen() calls go through it.
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')

print(html)

 

 

多个ip地址:

import urllib.request
import random

# Page that is fetched through the proxy; its body is printed below.
url = 'http://www.whatismyip.com.tw'

# Candidate proxies as 'ip:port'; one is picked at random on each run.
iplist = ['101.254.188.198:8080', '114.245.9.116:8118', '139.217.8.125:80']

proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})

# Build an opener around the handler and give it a browser-like User-Agent.
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')]

# Install the opener so that plain urlopen() calls go through it.
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')

print(html)

 

运行的结果:

>>>================================ RESTART ================================

>>>

 

 

<html>

  <head>

    <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>

    <meta name="description" content="我的IP查詢"/>

    <meta name="keywords" content="查ip,ip查詢,查我的ip,我的ip位址,我的ip位置,偵測我的ip,查詢我的ip,查看我的ip,顯示我的ip,what is my IP,whatismyip,my IP address,my IP proxy"/>

    <title>我的IP位址查詢</title>

  </head>

  <body>

<h1>IP位址</h1><h2>101.254.188.198</h2><h1>真實IP</h1><h2>222.182.98.44</h2>

 

<script type="text/javascript">

var sc_project=6392240;

var sc_invisible=1;

var sc_security="65d86b9d";

var scJsHost = (("https:" == document.location.protocol) ? "https://secure." : "http://www.");

document.write("<sc"+"ript type='text/javascript' src='" + scJsHost + "statcounter.com/counter/counter.js'></"+"script>");

</script>

<noscript><div class="statcounter"><a title="website statistics" href="http://statcounter.com/" target="_blank"><img class="statcounter" src="http://c.statcounter.com/6392240/0/65d86b9d/1/" alt="website statistics"></a></div></noscript>

 

  </body>

</html>

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值