# Web scraping: notes on using the urllib and requests libraries

import urllib.request

####### 1. Using the urllib library #####
# urllib has three commonly used modules: request, error and parse.
# The request module provides urlopen() as well as the Request class.
'''
# Textbook p.103: the simplest way to fetch a page with a GET request is urlopen()
response = urllib.request.urlopen('https://nba.hupu.com/', timeout=5)   # timeout sets the maximum wait time
print(response.read().decode('utf-8'))

# Here we use the Request class
url = "https://nba.hupu.com/"
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
print(response.status)                 # check the response status code
html = response.read().decode('utf-8')
print(html)
print(type(html))                      # check the type of html (str)

from urllib import request, parse

url = "https://httpbin.org/post"
# Build the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Host": "httpbin.org"
}
form = {
    "name": "harden"
}
data = bytes(parse.urlencode(form), encoding='utf8')   # POST data must be encoded to bytes
req = request.Request(url, headers=headers, data=data, method='POST')
response = request.urlopen(req)
html = response.read().decode('utf8')
print(html)

# Using a proxy (p.110)
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError

proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = build_opener(proxy_handler)
try:
    response = opener.open("https://www.baidu.com/")
    print(response.read().decode('utf8'))
except URLError as e:
    print(e.reason)

# Handling cookies
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
# print(response.read().decode('utf8'))
for item in cookie:
    print(item.name + "=" + item.value)
'''

# Using the parse module
'''
# urlparse() splits a URL into its components
from urllib.parse import urlparse

a = urlparse('https://bbs.hupu.com/22236262.html')
for i in range(4):      # scheme, netloc, path, params (ParseResult has 6 fields in total)
    print(a[i])
'''

####### 2. Using the requests library #####   note: the package is requests, not request
'''
import requests

a = requests.get('https://bbs.hupu.com/22236262.html')
print(a.text)
print(a.cookies)
'''

# Crawl the Zhihu "Explore" page
'''
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
r = requests.get('https://www.zhihu.com/explore', headers=headers)
print(r.text)
print(type(r.text))
'''

# Getting cookies (p.130)
'''
import requests

r = requests.get("https://www.baidu.com")
# print(r.text)
print(r.cookies)
print(type(r.cookies))
'''
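# (Added sketch, not from the textbook) Instead of copying the whole Cookie header
# by hand as in the next example, a requests.Session keeps cookies between requests
# automatically; httpbin.org is assumed here only as a test endpoint.
'''
import requests

s = requests.Session()
s.get('https://httpbin.org/cookies/set/number/123456789')   # the server sets a cookie
r = s.get('https://httpbin.org/cookies')                     # the Session sends it back automatically
print(r.text)                                                # expected to contain {"number": "123456789"}
'''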
z_c0="2|1:0|10:1526182363|4:z_c0|92:Mi4xSGZsV0F3QUFBQUFBVUNHSG91S1ZEU1lBQUFCZ0FsVk4yX19rV3dCLTFSQjRVSTh4YkhTeFU2SHI5RHpYd05UYXNn|cf51057a878911843149efceeb32a706ca6588e6eb7ed00459f65dab38a21b3b"' } r=requests.get('https://www.zhihu.com/',headers=headers) print(r.text) ''' #测试ssl的证书验证问题 import requests a=requests.get("https://music.douban.com/chart") print(a.text)
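# (Added sketch) The requests counterpart of the urllib ProxyHandler example above:
# pass a proxies dict directly to requests.get(). 127.0.0.1:9743 is the same assumed
# local proxy and the request will fail unless such a proxy is actually running.
'''
import requests

proxies = {
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
}
r = requests.get('https://www.baidu.com', proxies=proxies, timeout=5)
print(r.status_code)
'''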