为了满足自己的好奇心,也想把学过的Python知识用起来,我向小伙伴要来了一位B站博主的爬虫教学视频,手把手教你学爬虫,讲得真的很详细。
先上一个目录结构压压惊:
接下来贴上代码笔记,以及用到的第三方库目录:
1.requests的用法:
import requests

# Basic GET request: fetch a page and inspect the resulting Response object.
url = 'https://www.baidu.com'
# Always pass a timeout: without one, requests may block indefinitely when
# the server never answers.
res = requests.get(url=url, timeout=10)

print(res)                          # the Response object itself
print(res.status_code)              # HTTP status code
print(res.content)                  # raw response body as bytes
print(res.content.decode('utf-8'))  # body decoded explicitly as UTF-8
print(res.text)                     # body decoded with the encoding requests inferred
print(res.url)                      # URL of the response
print(res.request.headers)          # headers sent with the request
print(res.headers)                  # headers returned by the server
import requests

# GET request that sends a browser-like User-Agent header and, when the
# server answers 200, saves the page source to ./text.html.
url = 'https://www.xicidaili.com/nn'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36'
}
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url=url, headers=headers, timeout=10)
code = res.status_code
print(code)
if code == 200:
    # Write as UTF-8 explicitly so the result does not depend on the
    # platform's default file encoding.
    with open('./text.html', 'w', encoding='utf-8') as fp:
        fp.write(res.text)
import requests

# POST form data to the Baidu translate suggestion endpoint and print the
# first suggestion for the word typed by the user.
url = 'https://fanyi.baidu.com/sug'
word = input('请输入要翻译的中文:')
headers = {
}
data = {
    'kw': word
}
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.post(url=url, headers=headers, data=data, timeout=10)
code = res.status_code
print(code)
print(res.text)
if code == 200:
    print('请求成功')
    # Parse the JSON only after the status check: an error page is usually
    # not JSON and res.json() would raise on it. A separate name is used so
    # the request payload in `data` is not overwritten.
    result = res.json()
    print(result)
    if result['errno'] == 0:
        print('响应成功')
        suggestions = result['data']
        if suggestions:
            k = suggestions[0]['k']
            # The script picks the second-to-last ';'-separated piece of the
            # explanation; fall back to the only piece when there is no ';'
            # instead of raising IndexError.
            parts = suggestions[0]['v'].split(';')
            v = parts[-2] if len(parts) >= 2 else parts[0]
            print(k+'-->'+v)
        else:
            # An empty suggestion list would otherwise raise IndexError.
            print('没有找到翻译结果')
import requests

# Log in through a Session so the cookies set by the login response are sent
# automatically with the follow-up request for the user page.
url = 'http://www.rrys2019.com/user/user'
loginUrl = 'http://www.rrys2019.com/User/login/ajaxLogin'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36'
}
# NOTE(review): credentials are hard-coded here; load them from environment
# variables or a config file before using this outside a demo.
# NOTE(review): the key 'remeber' looks like a misspelling of 'remember' -
# confirm which field name the login form expects before changing it.
data = {
    'account': 'yichuan@itxdl.cn',
    'password': 'pyTHON123',
    'remeber': '1',
    'url_back': 'http://zmz2019.com/user/user'
}
# requests.Session() is the supported constructor (requests.session() is a
# legacy alias); the context manager closes pooled connections on exit.
with requests.Session() as req:
    res = req.post(url=loginUrl, headers=headers, data=data, timeout=10)
    code = res.status_code
    print(code)
    if code == 200:
        # Same session, so the login cookies accompany this request.
        res = req.get(url=url, headers=headers, timeout=10)
        with open('rr.html', 'w', encoding='utf-8') as fp:
            fp.write(res.text)
2.XPath的用法:
from lxml import etree
# Sample HTML document (a small login form) used as XPath input below.
text ='''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>老白的登陆界面</title>
<link rel="stylesheet" href="web.css"/>
</head>
<body>
<div >
<div class="kuai">
<p class="inpu">账号:</p>
<input type="text" value="请输入账号"/><br />
<p class="inpu">密码:</p>
<input type="password" /><br />
<br />
<input type="submit" value="登陆"/>
<input type="submit" value="忘记密码" />
</div>
</div>
</body>
</html>
'''
# Parse the HTML string into an element tree that can be queried with XPath.
html = etree.HTML(text)
# Absolute path from the root: select the text of every <p> under
# /html/body/div/div; xpath() returns the matches as a list.
r = html.xpath('/html/body/div/div/p/text()')
print(r)
# Same path restricted with [1]: XPath positions are 1-based, so this
# selects only the first <p>.
r = html.xpath('/html/body/div/div/p[1]/text()')
print(r)
# Alternative input: parse an HTML file from disk using lxml's HTMLParser
# instead of parsing an in-memory string.
html = etree.parse('./test.html',etree.HTMLParser())