References
- 莫烦Python scraping tutorial https://mofanpy.com/tutorials/data-manipulation/scraping/why
- Beautiful Soup documentation https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
Understanding web page structure
- You can refer to the front-end posts on my blog:
  - HTML https://bsheepcoder.github.io/2022/02/18/Fe_HTML1/
  - CSS https://bsheepcoder.github.io/2022/02/18/Fe_CSS1/
- Most of what we scrape is the information inside the body element (see the small sketch below)
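As a minimal illustration of that structure, here is a tiny, hypothetical page written as a Python string (purely an example, not any real page): the title sits in head, while the text a scraper usually wants sits in body.
# A tiny, hypothetical HTML page: metadata sits in <head>,
# the content we usually scrape sits in <body>
sample_html = """
<html>
  <head><title>Post title</title></head>
  <body>
    <h1>Heading</h1>
    <p>Article text that a scraper would extract.</p>
    <a href="https://example.com">a link</a>
  </body>
</html>
"""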
Understanding the transfer protocol
The protocol is the agreed-upon convention for exchanging data.
http
- Common request headers (see the sketch after this list)
  - User-Agent: identifies the client sending the request
  - Connection: whether to close the connection or keep it alive once the request completes
- Common response headers
  - Content-Type
https
- Secure Hypertext Transfer Protocol
- Encryption schemes
  - Symmetric-key encryption
  - Asymmetric-key encryption
  - Certificate-based key encryption
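A minimal sketch of how these headers appear in practice with the requests library (the URL is just a placeholder): set the request headers yourself, then read a response header back.
import requests

# Placeholder URL; any page you are allowed to fetch works here
url = 'https://bsheepcoder.github.io/'
headers = {
    # User-Agent: identifies the client sending the request
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    # Connection: keep the connection alive after the request completes
    'Connection': 'keep-alive',
}
r = requests.get(url, headers=headers)
# Content-Type is a common response header
print(r.headers.get('Content-Type'))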
Selecting with regular expressions
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from urllib.request import urlopen
import re
# Open the page and read its HTML; decode as utf-8 because the page contains Chinese text
html = urlopen("https://bsheepcoder.github.io/2022/02/18/Fe_CSS1/").read().decode('utf-8')
print(html)
# Select the title with a regular expression
res = re.findall(r"<title>(.+?)</title>", html)
print(res)
Beautiful Soup
- Simplifies the scraping workflow
- More advanced matching than raw regular expressions
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# Open the page and read its HTML; decode as utf-8 because the page contains Chinese text
html = urlopen("https://bsheepcoder.github.io/").read().decode('utf-8')
# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")
# Find every <a> element in the soup and read its link
for item in soup.select('a'):
    detail_url = str(item.get('href'))
    if detail_url[0:4] == "http":
        print(detail_url)
# Output
https://github.com/Bsheepcoder
https://github.com/Bsheepcoder
https://hexo.io
https://github.com/jerryc127/hexo-theme-butterfly
Scraping by CSS class
- Select elements by their CSS class and extract the content
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# Open the page and read its HTML; decode as utf-8 because the page contains Chinese text
html = urlopen("https://bsheepcoder.github.io/").read().decode('utf-8')
# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")
# Find every <a> element whose class is "article-title" and print its text
month = soup.find_all('a', {'class': 'article-title'})
for i in month:
    print(i.get_text())
soup + regex
- Scrape all links of a particular kind of element
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# Open the page and read its HTML; decode as utf-8 because the page contains Chinese text
html = urlopen("https://www.csdn.net/?spm=1005.2025.3001.4476").read().decode('utf-8')
# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")
# Find every <img> element whose src matches a .jpg URL
month = soup.find_all('img', {'src': re.compile(r'.*?\.jpg')})
for i in month:
    print(i['src'])
- Collect links to pages under a specific address
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# Open the page and read its HTML; decode as utf-8 because the page contains Chinese text
html = urlopen("https://www4.bing.com/search?q=find_all").read().decode('utf-8')
# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")
# Find every <a> element whose href points under https://www.cnblogs.com/
month = soup.find_all('a', {'href': re.compile(r'https://www\.cnblogs\.com/')})
for i in month:
    print(i['href'])
The versatile Requests library
- POST
- GET
import requests

# GET with query parameters: requests builds the final URL for you
param = {"wd": '莫烦python'}
r = requests.get('http://www.baidu.com/s', params=param)
print(r.url)

# POST form data, e.g. a login form (fill in your own credentials)
data = {'username': '', 'password': ''}
r = requests.post(
    'https://webvpn.scuec.edu.cn/users/sign_in', data=data
)
print(r.text)

# POST a file upload
file = {'uploadFile': open('./image.png', 'rb')}
r = requests.post(
    'http://pythonscraping.com/pages/files/processing2.php', files=file
)
print(r.text)
Exercise: downloading images
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
import os
# Placeholder: replace "URL" with the page that lists the images
html = requests.get("URL").text
soup = BeautifulSoup(html, 'lxml')
img_ul = soup.find_all('img', {"class": "post-thumb"})
print(len(img_ul))
# Create the output folder
os.makedirs('./img/', exist_ok=True)
# Download each image
for ul in img_ul:
    url = ul['src']
    r = requests.get(url, stream=True)
    image_name = url.split('/')[-1]
    with open('./img/%s' % image_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
    print('Save %s' % image_name)
Bulk image scraping
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests
import os
# Sites often run UA detection;
# UA spoofing: add a browser User-Agent to the request headers
# Parameter setup
url1 = 'MAIN_URL'  # placeholder for the first listing page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 '
                  'Safari/537.36 '
}
# Send the request
response_html = requests.get(url=url1, headers=headers, verify=False).text
# print(response_html)
for i in range(2, 70):
    # Create the output folder
    os.makedirs('./img/', exist_ok=True)
    # Collect the links to the individual gallery pages
    soup = BeautifulSoup(response_html, 'lxml')
    img_url = soup.find_all('a', {"class": "featured-img-box"})
    print("This page links to {} galleries".format(len(img_url)))
    for iu in img_url:
        url_img = iu['href']
        response_imgPage = requests.get(url=url_img, headers=headers).text
        imgPage_soup = BeautifulSoup(response_imgPage, 'lxml')
        img_link = imgPage_soup.find_all('img', {"title": "source: imgur.com"})
        for il in img_link:
            link = il['src']
            print('Image link:', link)
            # Use a GET request to fetch the resource, but only for direct
            # image links (those starting with "https://i")
            if link[0:9] != 'https://i':
                continue
            r = requests.get(link, stream=True, verify=False)
            image_name = link.split('/')[-1]
            with open('./img/%s' % image_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=128):
                    f.write(chunk)
            print('Save %s' % image_name)
    # Move on to the next listing page
    ul2 = 'SUB_URL' + str(i)  # placeholder: listing-page URL prefix + page number
    response_html = requests.get(url=ul2, headers=headers).text
Multi-process and distributed scraping
- Take advantage of multiple CPU cores; on Windows you can run devmgmt.msc from cmd to see how many cores your CPU has
- Python threads are limited by the global interpreter lock (GIL), so multiprocessing is recommended (see the sketch below)
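A minimal sketch of the multiprocessing idea, assuming you have a list of page URLs to fetch; the fetch_title helper and the URLs are just illustrative. Each worker process downloads and parses one page, so the work spreads across CPU cores.
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

def fetch_title(url):
    # Download one page and return its <title> text
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    return url, soup.title.get_text() if soup.title else ''

if __name__ == '__main__':
    # Placeholder URLs: replace with the pages you actually want to crawl
    urls = [
        'https://bsheepcoder.github.io/',
        'https://mofanpy.com/',
    ]
    # Pool() defaults to one worker process per CPU core
    with Pool() as pool:
        for url, title in pool.map(fetch_title, urls):
            print(url, '->', title)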