基本使用
"""Basic usage of urllib: fetch one page and print its HTML."""
import urllib.request

url = "https://ssr1.scrape.center/"

# FIX: the response (an http.client.HTTPResponse) was never closed;
# using it as a context manager releases the connection deterministically.
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')

print(content)
一个类型和三个方法
"""
一个类型和三个方法
Author:binxin
Date:2023/11/19 18:41
"""
import urllib.request
url = "https://ssr1.scrape.center/"
response = urllib.request.urlopen(url)
print(type(response))
print(response.getheaders())
下载
"""
下载
Author:binxin
Date:2023/11/20 20:19
"""
import urllib.request
url_video = 'https://vd2.bdstatic.com/mda-pkjg3j1629re4z2h/720p/h264/1700480141426778560/mda-pkjg3j1629re4z2h.mp4?v_from_s=hkapp-haokan-hbe&auth_key=1700494480-0-0-98fb46f2a2d69b62592d1344d6ee60b0&bcevod_channel=searchbox_feed&pd=1&cr=2&cd=0&pt=3&logid=2080435259&vid=15565599946852966896&klogid=2080435259&abtest='
urllib.request.urlretrieve(url_video,'bilibili.mp4')
请求对象的定制
- UA:User Agent中文名为用户代理,简称 UA,它是一个特殊字符串头,使得服务器能够识别客户使用的操作系统 及版本、CPU 类型、浏览器及版本。浏览器内核、浏览器渲染引擎、浏览器语言、浏览器插件等
"""
请求对象的定制
Author:binxin
Date:2023/11/20 20:37
"""
import urllib.request
url = 'https://www.baidu.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
编解码
- get请求方式:urllib.parse.quote()
"""
get请求的quote方法
Author:binxin
Date:2023/11/21 11:10
"""
import urllib.request
import urllib.parse
url = 'https://www.bing.com/search?q='
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
url = url + name
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
- get请求方式:urllib.parse.urlencode()
"""
get请求urlencode方法
Author:binxin
Date:2023/11/21 11:26
"""
import urllib.parse
import urllib.request
url = 'https://www.baidu.com/s?'
data = {
'wd': '周杰伦',
'sex': '男',
'location': '中国台湾省'
}
data = urllib.parse.urlencode(data)
url = url + data
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
- post请求方式
"""
post请求
Author:binxin
Date:2023/11/21 16:26
"""
import urllib.request
import urllib.parse
url = 'https://fanyi.baidu.com/sug'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
data = {
'kw': 'spider'
}
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
import json
obj = json.loads(content)
print(obj)
"""
百度详细翻译
Author:binxin
Date:2023/11/21 16:40
"""
import urllib.request
import urllib.parse
url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
headers = {
'Cookie': 'REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; PSTM=1690027568; ZFY=hJHsAJcm:BzM1AXD9a0vLfFZGJrzgW2kpVMqv0v6Ps1o:C; H_WISE_SIDS=234020_216844_213353_214793_110085_244716_257731_257015_260234_253022_259300_261715_236312_256419_265302_265881_266361_265776_267288_267371_266846_267421_265615_267405_265986_256302_266188_267898_259033_266713_268406_268593_268030_268842_259643_269232_269388_268766_188333_269730_269832_269904_269803_269049_267066_256739_270460_270534_267528_270625_270664_270548_270922_270966_271039_268874_270793_271169_271175_271193_268728_269771_267782_268987_269034_271229_269621_267659_271319_265032_269892_266027_270482_269609_270102_271608_270876_270443_269785_270157_271671_271985_271813_271957_271954_271943_256151_269211_234295_234207_266324_271187_272225_270055_272279_263618_267596_272055_272366_272008_272337_267559_272460_271145_8000076_8000108_8000124_8000136_8000159_8000164_8000168_8000177_8000179_8000186_8000203; BAIDU_WISE_UID=wapp_1692517164729_638; __bid_n=18a11e266418064ed3a010; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1692497260,1692755924,1693024343,1693128438; 
H_WISE_SIDS_BFESS=234020_216844_213353_214793_110085_244716_257731_257015_260234_253022_259300_261715_236312_256419_265302_265881_266361_265776_267288_267371_266846_267421_265615_267405_265986_256302_266188_267898_259033_266713_268406_268593_268030_268842_259643_269232_269388_268766_188333_269730_269832_269904_269803_269049_267066_256739_270460_270534_267528_270625_270664_270548_270922_270966_271039_268874_270793_271169_271175_271193_268728_269771_267782_268987_269034_271229_269621_267659_271319_265032_269892_266027_270482_269609_270102_271608_270876_270443_269785_270157_271671_271985_271813_271957_271954_271943_256151_269211_234295_234207_266324_271187_272225_270055_272279_263618_267596_272055_272366_272008_272337_267559_272460_271145_8000076_8000108_8000124_8000136_8000159_8000164_8000168_8000177_8000179_8000186_8000203; APPGUIDE_10_6_5=1; APPGUIDE_10_6_6=1; BAIDUID_BFESS=EBC2B0F02DE54CA945DEC2A522C58DC0:FG=1; APPGUIDE_10_6_7=1; APPGUIDE_10_6_9=1; BDUSS=VBqWS16dlBxNHBaQjA4c04yM2Z3S1hJVEp2MGY4YWpCenZ0flZMVnZwSlZobjFsRVFBQUFBJCQAAAAAAQAAAAEAAAA1wVhn1r3Uwsz9xM~J-QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFX5VWVV-VVlU; BDUSS_BFESS=VBqWS16dlBxNHBaQjA4c04yM2Z3S1hJVEp2MGY4YWpCenZ0flZMVnZwSlZobjFsRVFBQUFBJCQAAAAAAQAAAAEAAAA1wVhn1r3Uwsz9xM~J-QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFX5VWVV-VVlU; BIDUPSID=EBC2B0F02DE54CA945DEC2A522C58DC0; H_PS_PSSID=39669_39663_39676_39678_39710_39713_39749_39674_39785_39703_39793_39682; BA_HECTOR=248l252k80ak00240l2080051ilmkii1r; RT="z=1&dm=baidu.com&si=82643ae1-aaab-49e1-b3e4-a3c61e4bb037&ss=lp6w2pjo&sl=1&tt=1s7&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=2ui&ul=smo&hd=sn6"; ab_sr=1.0.1_YTNiYWUxNWM0YmU5MWRlMDdjNWY2MThlOTA0NDEzNmEwM2FhNTFkNzRiYzVkMjI4YTdjNjI5MTU5OWZlNzk4ZDU3NmViNmMzMjhlNTk2ZTI0ZDUzMTQzMTQzZTJiYWNiODBmOTVkYzVkOGQ1NWY1MGY2NDNlNTBmYzk4Njg1OWU5Y2IyZTA2OWRmYjQ4MjRhYWM2MWFiN2FkYTRhYjM5Y2NjMmE1NmYwMzFiMTgxNGQ1YjdjMGEwYzczZWU2NWMy',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
}
data = {
'from': 'en',
'to': 'zh',
'query': 'spider',
'transtype': 'realtime',
'simple_means_flag': '3',
'sign': '63766.268839',
'token': 'ae16933c30637316aa2381165ae3e29a',
'domain': 'common',
'ts': '1700555095216'
}
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
import json
obj = json.loads(content)
print(obj)
ajax的get请求
"""
ajax get请求
Author:binxin
Date:2023/11/23 19:16
"""
import urllib.request
url = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
with open('douban1.json', 'w', encoding='utf-8') as fp:
fp.write(content)
"""
豆瓣电影前十页
Author:binxin
Date:2023/11/23 19:30
"""
import urllib.parse
import urllib.request
def create_request(page):
base_url = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&"
data = {
'start': (page - 1) * 20,
'limit': 20
}
data = urllib.parse.urlencode(data)
url = base_url + data
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
return request
def get_content(request):
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
return content
def down_load(page, content):
with open(f'douban{page}.json', 'w', encoding='utf-8') as fp:
fp.write(content)
if __name__ == '__main__':
start_page = int(input("起始页码:"))
end_page = int(input("结束页码:"))
for page in range(start_page, end_page + 1):
request = create_request(page)
content = get_content(request)
down_load(page, content)
ajax的post请求
"""
ajax post
Author:binxin
Date:2023/11/23 20:15
"""
import urllib.request
import urllib.parse
def create_request(page):
base_url = "https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
data = {
'cname': '北京',
'pid': '',
'pageIndex': page,
'pageSize': '10',
}
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=base_url, data=data, headers=headers)
return request
def get_content(request):
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
return content
def down_load(page, content):
with open(f'kfc{page}.json', 'w', encoding='utf-8') as fp:
fp.write(content)
if __name__ == '__main__':
start_page = int(input("起始页码:"))
end_page = int(input("结束页码:"))
for page in range(start_page, end_page + 1):
request = create_request(page)
content = get_content(request)
down_load(page, content)
URLError\HTTPError
- HTTPError类是URLError类的子类
- 导入的包
urllib.error.HTTPError
urllib.error.URLError
- http错误:http错误是针对浏览器无法连接到服务器而增加出来的错误提示。引导并告诉浏览者该页是哪里出了问题
- 通过urllib发送请求的时候,有可能会发送失败,这个时候如果想让你的代码更加的健壮,可以通过try-except进行捕获异常,异常有两类,URLError\HTTPError
"""
URLError/HTTPError
Author:binxin
Date:2023/11/23 20:40
"""
import urllib.request
import urllib.error
url = 'https://www.goudan111.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
try:
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
except urllib.error.HTTPError:
print('系统正在升级...')
except urllib.error.URLError:
print('系统升级')
Cookie登录
"""
cooking登录
在数据采集需要绕过登录
Author:binxin
Date:2023/11/23 20:51
"""
import urllib.request
url = "https://m.weibo.cn/profile/7844546355"
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Cookie': 'WEIBOCN_FROM=1110006030; SUB=_2A25IWztpDeRhGeFG71YU9CjPzjmIHXVrGTKhrDV6PUJbkdANLUHckW1NeW0UMmSkDnXteuMWZ6_P3Hrnm486Vsys; MLOGIN=1; _T_WM=60448710269; M_WEIBOCN_PARAMS=lfid%3D102803%26luicode%3D20000174%26uicode%3D20000174; XSRF-TOKEN=fd466d; mweibo_short_token=5c3125a1a8',
'Mweibo-Pwa': '1',
'Referer': 'https://m.weibo.cn/profile/7844546355',
'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': 'Windows',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
with open('weibo.html', 'w', encoding='utf-8') as fp:
fp.write(content)
Handler处理器
"""
Handler处理器
Author:binxin
Date:2023/11/24 14:16
"""
import urllib.request
url = "https://www.baidu.com"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
handler = urllib.request.HTTPSHandler()
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
代理服务器
"""
代理服务器
Author:binxin
Date:2023/11/24 14:25
"""
import urllib.request
import urllib.parse
url = 'https://www.baidu.com/s?wd=ip'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
proxies = {
'http': '121.226.89.230:20516'
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
fp.write(content)
代理池
"""
代理池
Author:binxin
Date:2023/11/24 14:47
"""
import random
import urllib.request
proxies_pool = [
{'http': '42.249.189.41:17666'},
{'http': '27.154.221.103:19542'}
]
proxies = random.choice(proxies_pool)
url = 'http://www.baidu.com/s?wd=ip'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
fp.write(content)