1. requests
General steps of a crawler:
- Specify the URL
- Send the request
- Get the response
- Persist the data
Crawling the Sogou homepage:
import requests

# Crawl the Sogou homepage and save it to a local HTML file
if __name__ == '__main__':
    url = "https://www.sogou.com/"
    response = requests.get(url=url)
    # print(response.text)
    with open('./sougou.html', "w", encoding='utf-8') as fp:
        fp.write(response.text)
    print("爬取结束!!!")
- Case 1
Crawl the Sogou search results page for a given keyword (a simple web collector):
import requests

if __name__ == '__main__':
    url = 'https://www.sogou.com/web'
    # UA spoofing: pretend to be a browser
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    kw = input("enter a word:")
    # Wrap the query parameters in a dict
    param = {
        'query': kw
    }
    response = requests.get(url=url, params=param, headers=header)
    page_text = response.text
    filename = kw + '.html'
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(filename, '保存成功')
- Case 2
Cracking Baidu Translate
POST request (carrying parameters); the response data is JSON.
import requests
import json

if __name__ == '__main__':
    url = 'https://fanyi.baidu.com/sug'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    word = input("input a word:")
    data = {
        "kw": word
    }
    response = requests.post(url=url, data=data, headers=header)
    dic_obj = response.json()
    print(dic_obj['data'][0]['v'])
    # filename = word + '.json'
    # fp = open(filename, 'w', encoding='utf-8')
    # json.dump(dic_obj, fp=fp, ensure_ascii=False)  # ensure_ascii=False so Chinese is written as-is
    # print("over!!!")
- Case 3
Crawl movie detail data from the Douban movie category ranking at https://movie.douban.com/
import requests

if __name__ == '__main__':
    url = 'https://movie.douban.com/j/chart/top_list'
    # Query parameters for the ranking API
    param = {
        'type': '24',
        'interval_id': '100:90',
        'start': '0',
        'limit': '50'
    }
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    response = requests.get(url=url, params=param, headers=header)
    list_data = response.json()  # the response is a JSON list of movie dicts
    # print(list_data)
    for data in list_data:
        title = data['title']
        score = data['score']
        msg = "影片:{},评分:{}".format(title, score)
        print(msg)
- Case 4
Crawl the KFC restaurant locator at http://www.kfc.com.cn/kfccda/index.aspx for the restaurants at a given location:
import requests

if __name__ == '__main__':
    # The URL must be kept complete; do not trim it
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    # address = input("请输入要查询的地点:\r\n")
    data = {
        # Empty values still need to be included
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': '1',
        'pageSize': '10'
    }
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    response = requests.post(url=url, data=data, headers=header)
    data_list = response.json()['Table1']
    # print(data_list)
    for data in data_list:
        storeName = data['storeName']
        addressDetail = data['addressDetail']
        print('店名:{}餐厅,地址:{}'.format(storeName, addressDetail))
2. Data Parsing
- Focused crawler: crawls only the specified content within a page.
Coding workflow:
- Specify the URL
- Send the request
- Get the response data
- Parse the data
- Persist the data
Categories of data parsing:
- Regular expressions
- bs4
- xpath (***)
How data parsing works:
- The local text content to be parsed is stored between tags or in the corresponding tag attributes.
- First locate the specified tags.
- Then extract (parse) the data stored in the tags or in their attributes.
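As a minimal sketch of this workflow (the URL and the regular expression here are placeholders for illustration, not taken from any case below):
import re
import requests

if __name__ == '__main__':
    # 1. Specify the URL (placeholder)
    url = 'https://example.com/'
    headers = {"User-Agent": "Mozilla/5.0"}
    # 2. Send the request / 3. Get the response data
    page_text = requests.get(url=url, headers=headers).text
    # 4. Parse the data: a simple regex grabs the page <title> as a stand-in for
    #    "locate the tag, then extract what is stored in it"
    match = re.search(r'<title>(?P<title>.*?)</title>', page_text, re.S)
    title = match.group('title') if match else ''
    # 5. Persist the data
    with open('result.txt', 'w', encoding='utf-8') as fp:
        fp.write(title)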
Regular expressions
Online regex tool: 在线正则表达式测试 (oschina.net)
- Metacharacter: a special character with a fixed meaning.
Common metacharacters:
- .      matches any single character except a newline
- \w     matches a letter, digit, or underscore
- \s     matches any whitespace character
- \d     matches a digit
- \n     matches a newline
- \t     matches a tab
- ^      matches the start of the string
- $      matches the end of the string
- \W     matches anything that is not a letter, digit, or underscore
- \D     matches a non-digit
- \S     matches a non-whitespace character
- a|b    matches character a or character b
- ()     matches the expression inside the parentheses, as a group
- [...]  matches any character in the character class
- [^...] matches any character not in the character class
Quantifiers: control how many times the preceding element is repeated
- *      zero or more times
- +      one or more times
- ?      zero or one time
- {n}    exactly n times
- {n,}   n or more times
- {n,m}  between n and m times
Greedy vs. lazy matching:
- .*   greedy matching
- .*?  lazy matching
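A tiny demo of the difference between greedy and lazy matching (the sample string is made up for illustration):
import re

html = "<div>first</div><div>second</div>"  # made-up sample text
# Greedy: .* grabs as much as possible, so the match spans both divs
print(re.findall(r"<div>.*</div>", html))    # ['<div>first</div><div>second</div>']
# Lazy: .*? stops at the earliest possible match
print(re.findall(r"<div>.*?</div>", html))   # ['<div>first</div>', '<div>second</div>']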
2.1 The re module
re.findall() --> returns a list of all substrings that match the pattern
# finditer: matches everything in the string and returns an iterator; call .group() on each item to get the text
it = re.finditer(r"\d+", "我的电话是10086,他的女朋友的电话是10010")
for i in it:
    print(i.group())
# search: returns as soon as one result is found; the result is a match object, call .group() to get the data
re.search()
# match: matches only from the beginning of the string
re.match()
# Precompile the regular expression
obj = re.compile(r"\d+", re.S)  # re.S lets . also match newlines
obj.finditer("我的电话是10086,他的女朋友的电话是10010")
Extracting data inside a regex:
(?P<group_name>.*?) can be used to pull a named piece out of the matched content.
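A small self-contained sketch of extracting data with a named group, reusing the phone-number sample string from above:
import re

text = "我的电话是10086,他的女朋友的电话是10010"
# Precompile a pattern with a named group, then read the group by name
obj = re.compile(r"电话是(?P<phone>\d+)")
for m in obj.finditer(text):
    print(m.group("phone"))   # 10086, then 10010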
- Example: tackle the Douban TOP250 by hand, https://movie.douban.com/top250
import requests
import re
import csv

if __name__ == '__main__':
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    # Named groups pull out the title, year and number of raters from each list item
    obj = re.compile(r'class="">.*?<span class="title">(?P<name>.*?)</span>'
                     r'.*?<br>(?P<year>.*?) .*?<span>(?P<person>.*?)人评价', re.S)
    # Open the CSV file once, before the page loop, so later pages do not overwrite earlier ones
    f = open("data.csv", mode="w", encoding='utf-8', newline='')
    csv_writer = csv.writer(f)
    for page in range(0, 50, 25):
        url = "https://movie.douban.com/top250?start={}&filter=".format(page)
        res_obj = requests.get(url=url, headers=header)
        result = obj.finditer(res_obj.text)
        for it in result:
            # print(it.group("name"))
            # print(it.group("year").strip())
            # print(it.group("person"))
            dic = it.groupdict()
            dic['year'] = dic['year'].strip()
            csv_writer.writerow(dic.values())
    f.close()
    print("over!!!!!!")
- Case: full-page data from 电影天堂, https://dytt89.com/
import requests
import re

if __name__ == '__main__':
    main_url = "https://dytt89.com/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    response = requests.get(url=main_url, headers=header, verify=False)  # verify=False ignores the SSL certificate
    response.encoding = 'gb2312'  # specify the character set used by the site
    # print(response.text)
    # obj1 finds the "2022 must-watch" block, obj2 the links inside it, obj3 the details on each child page
    obj1 = re.compile(r"2022必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
    obj2 = re.compile("<a href='(?P<href>.*?)' title=", re.S)
    obj3 = re.compile('◎片 名 (?P<movie>.*?)<br />.*?'
                      '<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">magnet', re.S)
    result1 = obj1.search(response.text)
    uls = result1.group('ul')
    # result1 = obj1.finditer(response.text)
    # for iter in result1:
    #     print(iter.group("ul"))
    child_list = []
    result2 = obj2.finditer(uls)
    for it in result2:
        # Join the relative path onto the main URL
        child_url = main_url + it.group('href').strip("/")
        # print(child_url)
        child_list.append(child_url)
    # Extract the child pages
    for href in child_list:
        child_res = requests.get(url=href, headers=header, verify=False)
        child_res.encoding = "gb2312"
        # print(child_res.text)
        result3 = obj3.search(child_res.text)
        print(result3.group('movie'))
        print(result3.group('download'))
2.2 The bs4 module
Installation: pip install bs4
- Parsing workflow
- Hand the page source to BeautifulSoup to build a soup object
- html.parser specifies the HTML parser (lxml is another option)
find: finds only the first match
find(tag, attribute=...)
find_all: finds all matching tags, same usage as above
- Methods and attributes provided for data parsing
- soup.tagName: returns the first occurrence of tagName in the document.
- soup.find:
  - equivalent to soup.tagName
  - attribute-based locating: soup.find('div', class_="...")  # class_ takes a trailing underscore because class is a Python keyword
- soup.find_all: returns a list
- soup.select('some selector: id, class, or tag selector'): returns a list
  - hierarchy selector (.tang > li): > means one level down
  - a space means any number of levels down
- Getting the text between tags
  - soup.a.text / soup.a.string / soup.a.get_text()
  - text / get_text() return all the text content inside the tag
  - string only returns the text that is a direct child of the tag
- Getting an attribute value
  - soup.a['attribute name']
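A small self-contained demo of these bs4 calls on a made-up inline HTML snippet:
from bs4 import BeautifulSoup

html = """
<div class="song">
    <a href="https://www.example.com" title="demo">hello <b>world</b></a>
    <a href="https://www.example.org">second link</a>
</div>
"""  # made-up snippet for illustration
soup = BeautifulSoup(html, 'html.parser')
print(soup.a)                                  # first <a> tag in the document
print(soup.find('div', class_="song")['class'])  # attribute-based locating -> ['song']
print(soup.find_all('a'))                      # list of all <a> tags
print(soup.select('.song > a'))                # CSS hierarchy selector, returns a list
print(soup.a.text)                             # "hello world": all text inside the tag
print(soup.a.string)                           # None here, because the tag has mixed children
print(soup.a['href'])                          # attribute value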
- Case 1: crawl vegetable price data from Xinfadi (bs4 is not actually used here)
import requests

if __name__ == '__main__':
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    for i in range(1, 3):
        data = {
            'limit': '20',
            'current': i,
            'pubDateStartTime': '',
            'pubDateEndTime': '',
            'prodPcatid': '1186',
            'prodCatid': '',
            'prodName': ''
        }
        res = requests.post(url=url, data=data).json()
        # print(res['list'])
        data_list = res['list']
        for data in data_list:
            prodName = data['prodName']
            lowPrice = data['lowPrice']
            avgPrice = data['avgPrice']
            highPrice = data['highPrice']
            place = data['place']
            pubDate = data['pubDate']
            print(prodName, lowPrice, avgPrice, highPrice, place, pubDate)
        print("over one page")
    print("complete")
- Case: fetch the chapter titles and contents of 三国演义 from 古诗文网 (gushiwen.cn)
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    url = 'https://so.gushiwen.cn/guwen/book_46653FD803893E4F7F702BCF1F7CCE17.aspx'
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56"}
    page_text = requests.get(url=url, headers=headers).text
    # print(page_text)
    soup = BeautifulSoup(page_text, 'lxml')
    # Hierarchy selector: the first argument is a CSS selector built from class and tag names
    href_list = soup.select('.bookcont > ul > span > a')
    # print(href_list)
    fp = open("./三国演义.txt", "w", encoding="utf-8")
    for href in href_list:
        mulu = href.text          # chapter name: the text inside the <a> tag
        child_url = href['href']  # link to the chapter page
        detail_text = requests.get(url=child_url, headers=headers).text
        detail_soup = BeautifulSoup(detail_text, 'lxml')
        # print(detail_soup)
        title = detail_soup.select('.contson > p')[0].text       # chapter title from the detail page
        cont = detail_soup.find("div", class_="contson").text    # chapter body text
        fp.write(mulu + cont)
        print(mulu + title + " over!!!")
    print("爬取成功!!!")
    fp.close()
- Case: crawl HD wallpaper images from 壁纸大全 (bizhi360.com)
import requests
from bs4 import BeautifulSoup
import os
import time

if __name__ == '__main__':
    print("请选择要下载的图片信息:\n1.风景 ")
    print("2.美女\n3.动漫\n4.非主流\n5.创意\n6.可爱\n7.卡通\n8.动物\n9.汽车 ")
    kind_dic = {"1": "fengjing", "2": "meinv", "3": "dongman", "4": "feizhuliu", "5": "chuangyi",
                "6": "keai", "7": "katong", "8": "dongwu", "9": "qiche"}
    choice = input("请选择:\n")
    ser = kind_dic[choice]
    url = 'http://bizhi360.com/'
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56"}
    res = requests.get(url=url + ser, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')
    li_list = soup.select(".pic-list > ul > li > a")
    # print(li_list)
    if not os.path.exists("./pic"):
        os.makedirs("./pic")
    for li in li_list:
        title = li.text
        href = li['href'].strip("/")
        child_url = url + href
        child_res = requests.get(url=child_url, headers=headers)
        child_res.encoding = 'utf-8'
        child_soup = BeautifulSoup(child_res.text, 'lxml')
        # print(child_soup)
        pic_url = child_soup.select('.article > figure > a')[0]['href']
        pic = requests.get(url=pic_url, headers=headers).content  # .content gives the binary image data
        name = title + ".jpg"
        with open(f"./pic/{name}", "wb") as fp:
            fp.write(pic)
        print(name + " over!!!")
        time.sleep(1)  # be polite: pause between downloads
    print("爬取成功!!!")
- Modified for full-site crawling with a thread pool: see XPath Case 2 (https://pic.netbian.com/) below for the code.
2.3 XPath
- How xpath parsing works:
- Instantiate an etree object and load the source code to be parsed into it.
- Call the xpath method on the etree object with an xpath expression to locate elements and capture content.
- Environment setup
- pip install lxml
- How to instantiate an etree object: from lxml import etree
- Load the source of a local HTML file into an etree object:
etree.parse(filepath)
- Load source code fetched from the internet into an object:
etree.HTML(page_text)
- xpath expressions
r = tree.xpath('/html/')  # locate by hierarchy; returns a list of element objects
# /  starts locating from the root node; each / is one level
# // means any number of levels, and can start locating from any position
r = tree.xpath('//div[@class="song"]')       # attribute locating: tag[@attrName="attrValue"]
r = tree.xpath('//div[@class="song"]/p[3]')  # index locating: indexes start from 1
- ./ refers to the current (local) element
- How to get text
- /text() returns the text content that is a direct child
- //text() returns all text inside the tag, including non-direct children
- How to get an attribute
- /@attrName  ====>  img/@src
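A short self-contained lxml sketch of these expressions on a made-up HTML snippet:
from lxml import etree

html = """
<html><body>
  <div class="song">
    <p>one</p><p>two</p><p>three</p>
    <a href="https://www.example.com"><span>link text</span></a>
    <img src="demo.jpg"/>
  </div>
</body></html>
"""  # made-up snippet for illustration
tree = etree.HTML(html)
print(tree.xpath('//div[@class="song"]'))              # attribute locating -> list of elements
print(tree.xpath('//div[@class="song"]/p[3]/text()'))  # index locating, direct text -> ['three']
print(tree.xpath('//div[@class="song"]/a//text()'))    # //text(): includes non-direct children -> ['link text']
print(tree.xpath('//div[@class="song"]/img/@src'))     # attribute value -> ['demo.jpg']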
案例一58 同城 房屋信息爬取。北京二手房网,北京房产网,北京二手房买卖出售交易信息-北京58同城
import requests from lxml import etree import os if __name__ == '__main__': url = "https://bj.58.com/ershoufang/" headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56"} page_text = requests.get(url=url,headers=headers).text tree = etree.HTML(page_text) div_list = tree.xpath('//section[@class="list"]/div') if not os.path.exists("./58同城"): os.makedirs("./58同城") fp= open("./58同城/58.txt","w",encoding="utf-8") for div in div_list: title =div.xpath('./a/div[2]//h3/text()')[0] price =''.join(div.xpath('./a/div[2]/div[2]/p/span//text()')) address = ''.join(div.xpath('./a/div[2]/div//section/div[2]//text()')) big = div.xpath('./a/div[2]/div//section/div[1]/p[2]/text()')[0].strip() fp.write(title+' '+' '+price+' '+address+' '+big+'\r\n') fp.close()
- Case 2: crawl 4K wallpapers from https://pic.netbian.com/ with a thread pool
import requests
from lxml import etree
import os
import time
from concurrent.futures import ThreadPoolExecutor

def download_page(url):
    page = requests.get(url=url, headers=headers)
    tree = etree.HTML(page.text)
    li_list = tree.xpath('//ul[@class="clearfix"]/li')
    for li in li_list:
        url1 = main_url + li.xpath("./a/@href")[0]
        resp = requests.get(url1, headers=headers)
        resp.encoding = 'gbk'
        child_tree = etree.HTML(resp.text)
        pic2_url = child_tree.xpath('//*[@id="img"]/img/@src')[0]
        title = child_tree.xpath('//*[@id="img"]/img/@title')[0]
        pic_url = main_url + pic2_url
        img_res = requests.get(pic_url)
        # # General fix for garbled Chinese text:
        # title = title.encode('iso-8859-1').decode('gbk')
        name = title + ".jpg"
        with open(f"./4k图片/{name}", "wb") as fp:
            fp.write(img_res.content)
        print(name + " over!!!")
        time.sleep(3)  # slow down to avoid being blocked
    print("提取完毕")

if __name__ == '__main__':
    main_url = 'https://pic.netbian.com'
    url = "https://pic.netbian.com/4kmeinv/"
    if not os.path.exists("./4k图片"):  # keep the folder name consistent with the path used above
        os.makedirs("./4k图片")
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56"}
    download_page(url)
    # Pages 2-4 are fetched concurrently by a pool of 10 threads
    with ThreadPoolExecutor(10) as t:
        for i in range(2, 5):
            t.submit(download_page, f'https://pic.netbian.com/4kmeinv/index_{i}.html')
    print("爬取结束!!!!")
- Case 3: crawl all the city names listed at https://www.aqistudy.cn/historydata
import requests
from lxml import etree

if __name__ == '__main__':
    url = "https://www.aqistudy.cn/historydata/"
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56"}
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # xpath can use | to express an "or" between two expressions
    city_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]//div[2]/li/a')
    fp = open("city.txt", 'w', encoding='utf-8')
    for city in city_list:
        name = city.xpath('./text()')[0]
        fp.write(name + ' ')
    fp.close()
    print("over!!!")
- Case 4: download free resume templates from 站长素材, https://sc.chinaz.com/
import requests
from lxml import etree
import os

if __name__ == '__main__':
    url = "https://sc.chinaz.com/jianli/free.html"
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56"}
    page_res = requests.get(url=url, headers=headers)
    page_res.encoding = 'utf-8'
    tree = etree.HTML(page_res.text)
    a_list = tree.xpath('//div[@id="main"]//div/a')
    if not os.path.exists("./简历模板"):
        os.makedirs("./简历模板")
    for a in a_list:
        jl_url = a.xpath('./@href')[0]
        name = a.xpath('./img/@alt')[0]
        jl_res = requests.get(url=jl_url, headers=headers)
        jl_res.encoding = "utf-8"
        jl_tree = etree.HTML(jl_res.text)
        li_list = jl_tree.xpath('//div[@class="down_wrap"]/div[2]//li')
        try:
            for i in li_list:
                rar_url = i.xpath('./a/@href')[0]
                jl = requests.get(url=rar_url, headers=headers).content
                mb_name = "./简历模板/" + name + '.rar'
                with open(mb_name, "wb+") as fp:
                    fp.write(jl)
                print(name + " 下载完毕!!")
                break  # only the first download link is needed for each template
        except Exception:
            print("错误")
    print("全部完成!!!")