一.get爬虫
import requests

# Ask the user for a singer name and run a sogou web search for it.
keyword = input("请输入你要搜索的歌手")
search_url = f"https://www.sogou.com/web?query={keyword}"
# Sending a browser-like User-Agent is a minimal anti-bot workaround:
# without it sogou may refuse to serve the real page.
request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
}
resp = requests.get(search_url, headers=request_headers)
print(resp.text)
import requests
import json

# Query douban's top-list JSON endpoint.
url = "https://movie.douban.com/j/chart/top_list"
# Query-string parameters: category type, rating interval, and paging.
params = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20"
}
# Fix: the header dict was misspelled "hesder"; renamed for consistency
# with the other sections of this file.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
# For a GET request the parameters go through params=, which URL-encodes them.
resp = requests.get(url, params=params, headers=header)
# The endpoint returns JSON; .json() parses it into Python objects.
print(resp.json())
# resp.request.url shows the final URL including the encoded query string.
print(resp.request.url)
二. post爬虫
# A POST request; the endpoint answers with JSON.
import requests
import json

url = "https://fanyi.baidu.com/sug"
# Form body for the POST: the word the user wants to look up.
payload = {
    "kw": input("请输入一个单词")
}
# For POST, the form data goes through data=.
resp = requests.post(url, data=payload)
# .json() parses the JSON response body into a dict.
print(resp.json())
三.re模块
re 是用于处理正则表达式的内置模块
1.运用正则表达式解析原理
# re
# import re
# # findall表示查找所有
# result = re.findall(r"\d+","我今年19岁")
# print(result)
# 重点
# result = re.finditer(r"\d+","我有10岁,我有88888888")
# # 得到的是迭代器
# print(result)
# #表示匹配
# for item in result:
# # 从匹配得到的结果中拿到数据
# print(item.group())
# search只会匹配第一次的内容
# result = re.search(r"\d+","我是南栀北夏,19岁,在221班")
# print(result.group()) #19
# match,在匹配的时候,从字符串开始匹配,也是只匹配一个,类似于在正则前面加^
# result = re.match(r"\d+","我是南栀北夏,19岁,在221班")
# print(result) #None
# 预加载,提前把正则对象加载完毕
# obj = re.compile(r"\d+")
# # 直接使用
# result = obj.findall("我是南栀北夏,19岁,在221班")
# print(result) #['19', '221']
# 例:
import re

# Sample HTML; note the attribute values use DOUBLE quotes.
s = """
<div class="西游记"><span id="10010">中国联通</span></div>
<div class="西游记"><span id="10086">中国移动</span></div>
"""
# Pre-compile the pattern with named groups for the id and the text.
# Fix: the original pattern quoted the id with single quotes
# (<span id='...'>) while the HTML uses double quotes, so it never matched.
obj = re.compile(r'<span id="(?P<id>\d+)">(?P<name>.*?)</span>')
result = obj.finditer(s)
for item in result:
    # Pull the id out of each match by group name.
    # Fix: renamed from `id`, which shadowed the builtin.
    span_id = item.group("id")
    print(span_id)
2.运用正则表达式爬取豆瓣电影
# 1.拿到数据源代码
# 2.编写正则,提取页面数据
# 3.保存数据
import requests
import re

url = "https://movie.douban.com/top250"
# douban serves an empty page without a browser-like User-Agent.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
}
resp = requests.get(url, headers=header)
# Force utf-8 so the decoded text is not garbled.
resp.encoding = "utf-8"
wen = resp.text
# re.S lets "." also match newlines so the pattern can span lines.
obj = re.compile(r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?<p class="">.*?导演: (?P<dao>.*?) .*?<br>(?P<year>.*?) .*?<span class="rating_num" property="v:average">(?P<fen>.*?)</span>.*?<span>(?P<people>.*?)人评价</span>', re.S)
# Fix: open the output with a with-statement so it is always closed,
# even if an exception is raised while writing.
with open("top250.csv", "a", encoding="utf-8") as f:
    for item in obj.finditer(wen):
        name = item.group("name")
        dao = item.group("dao")
        year = item.group("year").strip()  # trim surrounding whitespace
        fen = item.group("fen")
        people = item.group("people")
        # Fix: write one record per line — the original had no trailing
        # newline and leaked a stray "changshi" literal into every row.
        f.write(f"{name},{dao},{year},{fen},{people}\n")
        # print(name, dao, year, fen, people)
resp.close()
print("提取完成")
3.爬取网址
import requests
import re

url = "https://www.dytt8.net/"
resp = requests.get(url)
# The site is GBK-encoded.
resp.encoding = "gbk"
wen = resp.text
# First narrow the page down to the <ul> that follows "最新电影更新:".
obj = re.compile(r"最新电影更新:.*?<ul>(?P<html>.*?)</ul>", re.S)
result1 = obj.search(wen)
# The captured chunk contains the <a> tags.
html = result1.group("html")
# Then extract each link's href and text from that chunk.
obj2 = re.compile(r"<a href='(?P<href>.*?)'>(?P<name>.*?)</a>", re.S)
# Fix: the original iterated with the OUTER pattern (obj.finditer), which
# cannot match inside the extracted chunk, so no links were ever found.
for item in obj2.finditer(html):
    href1 = item.group("href")
    # Fix: print the variable; the original printed the literal "href1".
    print(href1)
四. bs4解析
安装bs4
pip install bs4
# Import BeautifulSoup from the bs4 package.
from bs4 import BeautifulSoup

html = """
<ul>
<li><a href="nb.com">张三</a></li>
<li id="abc"><a href="baidu.com">李四</a></li>
<li><a href="sb.com">甲乙</a></li>
<li id="abc"><a href="jinghan.com">丙丁</a></li>
</ul>
"""
# 1. Build the BeautifulSoup parse tree.
page = BeautifulSoup(html, "html.parser")
# find() returns only the FIRST matching element.
# Usage: page.find("tag", attrs={"attribute": "value"})
bian = page.find("li", attrs={"id": "abc"})
print(bian)  # <li id="abc"><a href="baidu.com">李四</a></li>
# find_all() returns every matching element as a list.
bian2 = page.find_all("li", attrs={"id": "abc"})
print(bian2)
# [<li id="abc"><a href="baidu.com">李四</a></li>, <li id="abc"><a href="jinghan.com">丙丁</a></li>]
# You can keep searching inside an element you already found.
a = bian.find("a")
print(a)  # <a href="baidu.com">李四</a>
# .text gives the element's text content.
print(a.text)  # 李四
# get("attr") reads an attribute value.
print(a.get("href"))  # baidu.com
# Extract the text and link of every list item.
li_list = page.find_all("li")
for i in li_list:
    a = i.find("a")
    text = i.text
    # Fix: the href attribute lives on the <a> tag, not the <li>;
    # the original i.get("href") always returned None.
    href = a.get("href")
    print(text, href)
五. xpath解析
下载:pip install lxml
from lxml import etree

xml = """
<book>
<id>1</id>
<name>西游记</name>
<price>100</price>
<nick>nb</nick>
<author>
<nick id="10086">唐僧</nick>
<nick id="10010">悟空</nick>
<nick class="join">周杰伦</nick>
<nick class="joy">sb</nick>
<div>
<nick>肉类</nick>
</div>
</author>
<partner>
<nick id="p">潘子</nick>
<nick id="c">小哥</nick>
</partner>
</book>
"""
# etree.XML parses an XML string (this demo is XML, not HTML).
et = etree.XML(xml)
# "/" addresses the root node.
result = et.xpath("/book")
print(result)  # [<Element book at 0x...>]
# "name" directly under the root.
result_1 = et.xpath("/book/name")
print(result_1)  # [<Element name at 0x...>]
# text() extracts the text content.
result_2 = et.xpath("/book/name/text()")
print(result_2)  # ['西游记']
# Index [0] to get just the string.
result_3 = et.xpath("/book/name/text()")[0]
print(result_3)  # 西游记
# "//" would match descendants at ANY depth, e.g. et.xpath("/book//nick").
# "*" is a wildcard for any direct child, so this reaches the grandchildren.
result_5 = et.xpath("/book/*/nick/text()")
print(result_5)  # ['唐僧', '悟空', '周杰伦', 'sb', '潘子', '小哥']
# [@attr='value'] filters by attribute.
result_6 = et.xpath("/book/author/nick[@class='join']/text()")
print(result_6)  # ['周杰伦']
# A trailing "@attr" returns the attribute values themselves.
# Fix: renamed from result_6, which silently overwrote the query above.
result_7 = et.xpath("/book/partner/nick/@id")
print(result_7)  # ['p', 'c']
import requests
from lxml import etree
from bs4 import BeautifulSoup

# Fetch a zhubajie search-result page and print price/name text.
url = "https://www.zbj.com/qklkfzbj/f.html?fr=zbj.sy.zyyw_2nd.lv3&r=2"
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
# NOTE(review): these parameters are already embedded in `url`, so passing
# them again via params= duplicates the query string — confirm which is wanted.
data = {
"fr":"zbj.sy.zyyw_2nd.lv3",
"r":2
}
repe = requests.get(url,headers=headers,params=data)
repes = repe.text
# Parse the same document twice: once with lxml, once with BeautifulSoup.
et = etree.HTML(repes)
page = BeautifulSoup(repes,"html.parser")
# NOTE(review): `divs` is never used after this line — dead code?
divs = et.xpath("//div[@class='search-result-list-service']/div")
# Collect the text of every <span> inside each price box.
div_1 = page.find_all("div",attrs={"class":"price"})
for element in div_1:
    div_2 = element.find_all("span")
    for i in div_2:
        # Text of the current price span; overwritten on every iteration.
        text = i.text
        # print(text)
# Walk every link inside each name-pic box.
div_3 = page.find_all("div",attrs={"class":"name-pic-box"})
for j in div_3:
    div_4 = j.find_all("a")
    for y in div_4:
        text_1 = y.text
        # NOTE(review): `text` here is left over from the price loop above,
        # so every name is printed with the LAST price only — presumably
        # prices and names should be paired per item; confirm intent.
        print(text,text_1)
六.pyquery基础
# Install pyquery with: pip install pyquery
from pyquery import PyQuery

html = """
<li><a href="http://www.baidu.com">百度</a></li>
"""
# Load the HTML fragment into a PyQuery object.
p = PyQuery(html)
print(p)  # <li><a href="http://www.baidu.com">百度</a></li>
# It is a PyQuery instance.
print(type(p))  # <class 'pyquery.pyquery.PyQuery'>
# Calling the object with a CSS selector returns another PyQuery object.
li = p("a")
print(li)  # <a href="http://www.baidu.com">百度</a>
# Still a PyQuery instance, so calls can be chained.
print(type(li))  # <class 'pyquery.pyquery.PyQuery'>
# The descendant selector "li a" is equivalent to chaining p("li")("a").
a = p("li a")
print(a)  # <a href="http://www.baidu.com">百度</a>
from pyquery import PyQuery

html = """
<ul>
<li class="aaa"><a href="http://www.baidu.com"></a>百度</li>
<li class="aaa"><a href="http://www.tengxun.com"></a>腾讯</li>
<li class="bbb" id="qq"><a href="http://www.qq.com"></a>南栀</li>
<li class="bbb"><a href="http://www.ji.com"></a>爱企业</li>
</ul>
"""
p = PyQuery(html)
# ".aaa" selects every element whose class is "aaa".
a = p(".aaa")
print(a)
# Anchors nested under the .aaa items.
b = p(".aaa a")
print(b)
# "#qq" selects the element whose id is "qq"; then its anchor.
c = p("#qq a")
print(c)  # <a href="http://www.qq.com"/> inside the 南栀 item
# attr("name") reads an attribute value.
href = p("#qq a").attr("href")
print(href)  # http://www.qq.com
# text() reads the text content.
text = p("#qq a").text()
print(text)  # 南栀
from pyquery import PyQuery

html = """
<ul>
<li class="aaa"><a href="http://www.baidu.com"></a>百度</li>
<li class="aaa"><a href="http://www.tengxun.com"></a>腾讯</li>
<li class="bbb" id="qq"><a href="http://www.qq.com"></a>南栀</li>
<li class="bbb"><a href="http://www.ji.com"></a>爱企业</li>
</ul>
"""
p = PyQuery(html)
# attr() on a multi-element selection only returns the FIRST element's value.
# To read the attribute of every match, iterate with .items(),
# which yields one PyQuery object per matched element.
it = p("li a").items()
print(it)  # a generator object
for item in it:
    href = item.attr("href")
    text = item.text()
    # print(text, href)
    print(text, end=" ")
pyquery可以改变标签的结构:
from pyquery import PyQuery

html = """
<HTML>
<div class="aaa">哒哒哒</div>
<div class="bbb">嘻嘻嘻</div>
</HTML>
"""
p = PyQuery(html)
# Fix: "div .aaa" (with a space) is a DESCENDANT selector and matched
# nothing, because the .aaa element is the div itself;
# "div.aaa" (no space) selects it.
# after() inserts a new sibling right after the matched tag.
s_1 = p("div.aaa").after("""<div class="ccc">嗯嗯嗯</div>""")
# append() adds content inside the matched tag.
s_2 = p("div.aaa").append("""<span>nb真牛</span>""")
# remove() takes a CSS selector — fix: the original passed an HTML
# snippet, which removes nothing; this deletes the <span> added above.
s_3 = p("div.aaa").remove("span")
# remove_attr() deletes an attribute from the matched tag.
s_4 = p("div.aaa").remove_attr("class")
pyquery的实战案例:
import requests
from pyquery import PyQuery
# Fetch a page's source code.
def get_page(url):
    """Download *url* and return its body decoded as utf-8 text."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text
# Parse the page and print one field per user block.
def jie_page(html):
    """Parse *html* with PyQuery and print the first <li> of each user block."""
    doc = PyQuery(html)
    # Fix: this variable was named `list`, shadowing the builtin.
    users = doc(".list_user_msg__Mw2CQ").items()
    for user in users:
        # First <li> of the second inner <div>'s <ul>.
        p = user("div > div:nth-child(2) > ul > li:nth-child(1)").text()
        print(p)
# Program entry point.
def main():
    """Fetch the autohome review page and run the parser over it."""
    page_url = "https://k.autohome.com.cn/146"
    # Download the source, then parse it.
    source = get_page(page_url)
    jie_page(source)

if __name__ == '__main__':
    main()
七.cookie的处理:
import requests

# A session keeps cookies between requests, so the login cookie set by the
# first request is reused automatically by the later ones.
session = requests.session()
credentials = {
    "loginName": "你的用你户名",
    "password": "你的密码"
}
# Log in; the session stores the returned cookie.
login_url = "https://passport.17k.com/ck/user/login"
session.post(login_url, data=credentials)
# print(resp.text)
# Open the bookshelf with the logged-in session.
resp = session.get("https://user.17k.com/www/bookshelf/")
resp.encoding = "utf-8"
# Fetch one chapter's data through the same session.
resp_1 = session.get("https://www.17k.com/ck/book/3377676/chapter/48099192?subAllPrice=1&appKey=2406394919")
resp_1.encoding = "utf-8"
# print(resp.text)
print(resp_1.text)