1. Downloading Files
1.1 requests
pip install requests
import requests
photo = requests.get(url)
with open(file_name, 'wb') as f:
    f.write(photo.content)
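response.content holds the entire file in memory, which is fine for images but wasteful for large files. A streamed variant is sketched below (the function name and chunk size are illustrative, not from the original):

import requests


def download_large(url, file_name, chunk_size=8192):
    # Stream the response so the file is written chunk by chunk
    # instead of being held entirely in memory.
    with requests.get(url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)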
1.2 wget
pip install wget
import wget
wget.download(url, path)
Possible error:
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:997)>
Add the following before your code:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
Other solutions: see "[Python] SSL certificate problems when urllib accesses HTTPS".
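Note that patching ssl._create_default_https_context disables certificate verification for every HTTPS request in the process. When calling urllib directly, a more scoped alternative, assuming the certifi package is installed (pip install certifi), is to build a context from certifi's CA bundle and pass it per request:

import ssl
import urllib.request

import certifi  # assumed installed: pip install certifi

url = 'https://example.com/image.png'  # placeholder URL

# Verify against certifi's CA bundle for this request only,
# instead of disabling verification globally.
context = ssl.create_default_context(cafile=certifi.where())
with urllib.request.urlopen(url, context=context) as resp:
    data = resp.read()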
2. Scraping Images
Approach: find the request the site uses to fetch its image list, then request and download each image in turn.
Install the third-party libraries:
pip install wget beautifulsoup4 requests tqdm
Using Bizhihui (壁纸汇) as an example:
import ssl
from urllib.error import HTTPError

import requests
import tqdm
import wget
from bs4 import BeautifulSoup


def main():
    # Adjust the URL as needed: the 1 is the page number; the page size is fixed at 20
    url = 'https://www.bzh.com/renwu/1/?order=views'
    response = requests.get(url)
    if response.status_code == 200:
        root = BeautifulSoup(response.text, 'html.parser')
        images = root.select('img')
        if images:
            images = images[1:]  # skip the first <img>, which is not a wallpaper
            for image in tqdm.tqdm(images):
                try:
                    # wget uses urllib internally, so disable certificate verification globally
                    ssl._create_default_https_context = ssl._create_unverified_context
                    # drop the last 9 characters of the src (site-specific suffix)
                    wget.download(image.get('src')[:-9], '../images')
                except HTTPError as e:
                    print(e)


if __name__ == '__main__':
    main()
Downloading desktop (landscape) sizes:
import ssl
from urllib.error import HTTPError

import requests
import tqdm
import wget
from bs4 import BeautifulSoup


def main():
    url = 'https://www.bzh.com/dongman/1/?order=time'
    response = requests.get(url)
    if response.status_code == 200:
        root = BeautifulSoup(response.text, 'html.parser')
        images = root.select('a.item-img')
        if images:
            for image in tqdm.tqdm(images):
                response = requests.get(image['href'])
                if response.status_code == 200:
                    root = BeautifulSoup(response.text, 'html.parser')
                    pic = root.select_one('div.shadow-list a')
                    try:
                        ssl._create_default_https_context = ssl._create_unverified_context
                        wget.download(pic['href'], '../images')
                    except HTTPError as e:
                        print(e)


if __name__ == '__main__':
    main()
Multithreaded download:
import ssl
from concurrent.futures import ThreadPoolExecutor
from urllib.error import HTTPError

import requests
import wget
from bs4 import BeautifulSoup


def download(i):
    print(f'Iteration {i}')
    url = f'https://www.bizhihui.com/dongman/{i}/?order=views'
    response = requests.get(url)
    if response.status_code == 200:
        root = BeautifulSoup(response.text, 'html.parser')
        images = root.select('a.item-img')
        if images:
            for image in images:
                response = requests.get(image['href'])
                if response.status_code == 200:
                    root = BeautifulSoup(response.text, 'html.parser')
                    pic = root.select_one('div.shadow-list a')
                    try:
                        ssl._create_default_https_context = ssl._create_unverified_context
                        wget.download(pic['href'], '../images')
                    except HTTPError as e:
                        print(e)


def main():
    # 10 worker threads, one listing page (1-29) per task
    with ThreadPoolExecutor(10) as t:
        for i in range(1, 30):
            t.submit(download, i)


if __name__ == '__main__':
    main()
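One caveat with the snippet above: ThreadPoolExecutor.submit swallows exceptions until a future's result is read, so a page that fails simply disappears. A small sketch of an alternative main() that keeps the futures and reports errors, reusing download() from above (the variable names are illustrative):

from concurrent.futures import ThreadPoolExecutor, as_completed


def main():
    with ThreadPoolExecutor(10) as t:
        futures = {t.submit(download, i): i for i in range(1, 30)}
        for future in as_completed(futures):
            page = futures[future]
            try:
                future.result()  # re-raises any exception raised inside download()
            except Exception as e:
                print(f'page {page} failed: {e}')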
3. Getting a Crawler Past CAPTCHAs
- Step 1: download the CAPTCHA to a local file
import requests
from bs4 import BeautifulSoup

# Fetch the HTML of the page that shows the CAPTCHA
url = 'http://www.example.com/a.html'
resp = requests.get(url)
soup = BeautifulSoup(resp.content.decode('UTF-8'), 'html.parser')
# Find the CAPTCHA <img> tag and read its address
src = soup.select_one('div.captcha-row img')['src']
# Save the CAPTCHA image locally
resp = requests.get(src)
with open('../images/verify.png', 'wb') as f:
    f.write(resp.content)
- Step 2: recognize the CAPTCHA
pip install ddddocr

import ddddocr

ocr = ddddocr.DdddOcr()
with open('../images/verify.png', 'rb') as f:
    img = f.read()
code = ocr.classification(img)
print(code)
- Step 3: submit the CAPTCHA
# Get the token; CAPTCHA forms usually include a hidden token field
token = soup.find('input', {'name': 'csrfToken'}).get('value')
# URL that the submit button posts to
verify_url = 'https://www.example.com/verify'
# Check the browser dev tools (F12) when the form is submitted to see exactly which fields it sends
data = {
    'vcode': code,
    'token': token,
    'btnPost': ''
}
# Request headers (copy them from the request in F12)
headers = {
    'content-type': 'application/x-www-form-urlencoded',
    'user-agent': 'Mozilla/5.0 (Macintosh;) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
}
response = requests.post(verify_url, data=data, headers=headers)
if response.status_code == 200:
    print('CAPTCHA check - success')
else:
    print('CAPTCHA check - fail')
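The three snippets above issue independent requests, so any session cookie set when the CAPTCHA page is served is gone by the time the form is posted. A minimal sketch, reusing the example URLs and field names above, that runs all three steps through one requests.Session so cookies carry over:

import requests
import ddddocr
from bs4 import BeautifulSoup

session = requests.Session()  # keeps cookies across all three requests

page = session.get('http://www.example.com/a.html')
soup = BeautifulSoup(page.content.decode('UTF-8'), 'html.parser')

# Download and recognize the CAPTCHA within the same session
img = session.get(soup.select_one('div.captcha-row img')['src']).content
code = ddddocr.DdddOcr().classification(img)

token = soup.find('input', {'name': 'csrfToken'}).get('value')
resp = session.post('https://www.example.com/verify',
                    data={'vcode': code, 'token': token, 'btnPost': ''})
print('success' if resp.status_code == 200 else 'fail')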
4. Using an IP Proxy Pool
4.1 Proxy pools
An IP proxy pool can be thought of as a pool that holds many proxy IPs; a minimal sketch of such a pool follows the list below.
- The IPs in the pool have a lifecycle: they are re-validated periodically, and any that have stopped working are removed.
- The pool is replenished: new proxy IPs keep being added to it.
- Proxy IPs can be drawn from the pool at random.
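A minimal sketch of such a pool, assuming proxies are plain 'ip:port' strings and that liveness is checked with a test request (the class and method names are illustrative, not from any particular library):

import random

import requests


class ProxyPool:
    def __init__(self):
        self._proxies = set()

    def add(self, proxy):
        """Replenish the pool with a new 'ip:port' proxy."""
        self._proxies.add(proxy)

    def get_random(self):
        """Draw a random proxy from the pool."""
        return random.choice(list(self._proxies))

    def validate(self, test_url='https://myip.ipip.net/'):
        """Re-check every proxy and discard the ones that no longer work."""
        for proxy in list(self._proxies):
            try:
                requests.get(test_url,
                             proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                             timeout=5)
            except requests.RequestException:
                self._proxies.discard(proxy)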
4.2 Why use a proxy pool
Some large sites (especially e-commerce sites) try to keep crawlers from harvesting their data by limiting the number and frequency of requests coming from any single IP address, among other anti-scraping measures.
A crawler backed by an IP proxy pool can hide its own IP and switch to a random address on each request, bypassing that mechanism and collecting large amounts of data quickly.

4.3 Obtaining proxy IPs
Some sites offer free proxy IPs, but they are unstable and many of them don't work at all.
Paid providers are more reliable; if you only need a small volume, the free IPs granted on registration are usually enough.
4.4 Using proxy IPs
Fetch the proxy IPs from the provider's API (using Juliang IP (巨量IP) as an example).

Code:
import requests

api_url = 'http://v2.api.juliangip.com/dynamic/getips...'
username = '13255667788'
password = '123456'

# The API returns a JSON payload containing a list of 'ip:port' proxies
proxy_ip_list = requests.get(api_url).json()['data']['proxy_list']
for proxy_ip in proxy_ip_list:
    # Authenticated proxy URL: http://user:password@ip:port/
    proxies = {
        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip},
        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip},
    }
    # Check the exit IP by requesting an IP echo service through the proxy
    target_url = 'https://myip.ipip.net/'
    resp = requests.get(target_url, proxies=proxies)
    print(f'{proxy_ip} ------ {resp.text}')
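A proxy can also die mid-crawl, so in practice the request is usually wrapped in a small retry that switches to a different proxy on failure. A rough sketch, reusing proxy_ip_list, username, and password from the snippet above (the helper name is made up):

import random

import requests


def get_with_proxy(url, retries=3):
    """Try the request through up to `retries` randomly chosen proxies."""
    last_error = None
    for _ in range(retries):
        proxy_ip = random.choice(proxy_ip_list)
        auth_proxy = f'http://{username}:{password}@{proxy_ip}/'
        try:
            return requests.get(url,
                                proxies={'http': auth_proxy, 'https': auth_proxy},
                                timeout=10)
        except requests.RequestException as e:
            last_error = e  # this proxy failed, try the next one
    raise last_error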
5. Progress Bars
5.1 tqdm
pip install tqdm
import wget
from tqdm import tqdm
for url in tqdm(urlList):
    wget.download(url, path)
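Here tqdm only counts how many URLs have been handled. To show per-file progress in bytes, one common pattern is to stream the download with requests and advance the bar by chunk size; a sketch below, assuming the server returns a Content-Length header (the helper name is illustrative):

import requests
from tqdm import tqdm


def download_with_progress(url, file_name):
    with requests.get(url, stream=True, timeout=30) as resp:
        total = int(resp.headers.get('content-length', 0))  # 0 if the size is unknown
        with open(file_name, 'wb') as f, tqdm(total=total, unit='B', unit_scale=True) as bar:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
                bar.update(len(chunk))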