爬虫之urllib库以及requests库的使用说明

import urllib.request
#######一 urllib库的使用#####
# 里面有三个常用的模块,分别是request、error、parse,其中request里面就有urlopen以及Request
'''
# Textbook p.103: the simplest way to fetch a page with GET is urlopen().
# The timeout argument (seconds) limits how long the call may block.
with urllib.request.urlopen('https://nba.hupu.com/', timeout=5) as response:
    # The context manager closes the HTTP response once the body is printed.
    print(response.read().decode('utf-8'))

# Same fetch, this time going through an explicit Request object.
url = "https://nba.hupu.com/"
req = urllib.request.Request(url)
# Read everything needed inside the with-block so the response gets closed.
with urllib.request.urlopen(req) as response:
    print(response.status)   # status code of the response
    html = response.read().decode('utf-8')
print(html)
print(type(html))   # show the type of the decoded body


from urllib import request, parse
url = "https://httpbin.org/post"
# Build the request headers.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Host": "httpbin.org",
}
# Form fields to send; named form_data so the builtin dict is not shadowed.
form_data = {
    "name": "harden",
}
# The request body has to be bytes: url-encode the fields, then encode as UTF-8.
data = parse.urlencode(form_data).encode('utf8')
req = request.Request(url, headers=headers, data=data, method='POST')
# Decode the body inside the with-block so the response is closed afterwards.
with request.urlopen(req) as response:
    html = response.read().decode('utf8')
print(html)



# Using a proxy (p.110)
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError

# Map each URL scheme to the proxy that should carry it.
# The 'http' entry previously read 'http;//...' (semicolon), which is not a valid URL.
proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743',
})
opener = build_opener(proxy_handler)
try:
    # The context manager closes the response once the body is printed.
    with opener.open("https://www.baidu.com/") as response:
        print(response.read().decode('utf8'))
except URLError as e:
    # Report why the request failed instead of letting the script crash.
    print(e.reason)


# Handling cookies
import http.cookiejar
import urllib.request

# The jar is attached to the opener through HTTPCookieProcessor.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
# The body is not used by this demo, so close the response right away.
response.close()
for item in cookie:
    # format() also copes with a cookie whose value is None,
    # where string concatenation would raise TypeError.
    print("{}={}".format(item.name, item.value))
'''

# parse模块的使用
'''

# urlparse breaks a URL into its components.
from urllib.parse import urlparse
a = urlparse('https://bbs.hupu.com/22236262.html')
# Print the first four fields of the parse result, one per line.
for component in a[:4]:
    print(component)

'''



#######requests库的使用#####  注意不是request
'''
import requests
# Fetch one forum thread, then print its body text and the response cookies.
thread_url = 'https://bbs.hupu.com/22236262.html'
page = requests.get(thread_url)
print(page.text)
print(page.cookies)
'''


#下面爬取知乎发现页面的信息
'''
import requests
import re
# Send a browser-style User-Agent header along with the request.
explore_url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}
r = requests.get(explore_url, headers=headers)
# Print the page text, then the type of that text.
body = r.text
print(body)
print(type(r.text))
'''

#获取cookies  p130
'''import requests
# Request the Baidu home page and inspect the cookies on the response.
baidu_response = requests.get("https://www.baidu.com")
jar = baidu_response.cookies
print(jar)
print(type(baidu_response.cookies))
'''

'''
# Keep a logged-in state by sending cookies in the request headers (p.131).
import requests
# NOTE(review): the cookie value is hard-coded here; presumably captured from a
# browser session and no longer valid -- confirm before relying on it.
zhihu_home = 'https://www.zhihu.com/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'cookie': 'q_c1=0393d4a0ae6245cba2d113da4fc9c11b|1526092090000|1526092090000; _zap=3a66108d-04da-47b0-9ecb-b05ad8dc1828; _xsrf=c2e55ac7-29d3-4b89-b244-fed51770e9d2; l_n_c=1; n_c=1; l_cap_id="ZWMxMTkzYTY0ZDM0NGNiM2E2ODMwNjgwMjZjZjZhY2U=|1526176928|e31768819b294ddb8aa9524ee3c270bef243056c"; r_cap_id="MTc4Y2NhYTA1Y2RmNDQ3MzhjM2M4ZDdmYTMxNjQ0MGM=|1526176928|326b63ff464f2fa461bc7e1eb824d93cf74065f9"; cap_id="YWYxZWEyZWI0MzA0NDkyZDhkNzgyZGEwYjRlMDhjMjM=|1526176928|88d28b14bcc26d12d6a4d042521f9ab8adc468bf"; d_c0="AFAhh6LilQ2PTlJZy_n_4rERqeJpKtKvHuU=|1526176931"; __utma=51854390.1977582498.1526176934.1526176934.1526176934.1; __utmc=51854390; __utmz=51854390.1526176934.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.000--|3=entry_date=20180512=1; tgw_l7_route=e0a07617c1a38385364125951b19eef8; capsion_ticket="2|1:0|10:1526182354|14:capsion_ticket|44:MTg4MmE0Zjk1NmI4NDdmOWE4ZjI4NjE4ZDA4NGRlNjI=|0fe59b0b4e6611cf1a99851637cc93ceecd62267aeab0ec18b2c05ebe23c5338"; z_c0="2|1:0|10:1526182363|4:z_c0|92:Mi4xSGZsV0F3QUFBQUFBVUNHSG91S1ZEU1lBQUFCZ0FsVk4yX19rV3dCLTFSQjRVSTh4YkhTeFU2SHI5RHpYd05UYXNn|cf51057a878911843149efceeb32a706ca6588e6eb7ed00459f65dab38a21b3b"',
}

r = requests.get(zhihu_home, headers=headers)
print(r.text)
'''

# Testing the SSL certificate verification issue
import requests
chart_url = "https://music.douban.com/chart"
a = requests.get(chart_url)
# Print the text of the fetched page.
print(a.text)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值