最近对Python爬虫比较感兴趣,所以没事的时候来学习一下。
这里使用了爬虫时常用的两个库:requests 和 BeautifulSoup
requests 主要用于发送 get 和 post 请求,获取返回的数据消息。
BeautifulSoup 主要用于根据返回的HTML查询和检索数据
直接上代码吧:
import requests
import time
import os
from bs4 import BeautifulSoup
index = 0  # module-level counter: running number of albums seen across all listing pages


# Fetch one gallery listing page and hand every album link to LoadPage.
def get_one_page_data(pro, page_num):
    """Fetch one listing page and download every album linked from it.

    Args:
        pro: category id inserted into the listing URL path.
        page_num: 1-based listing page number.

    Side effects:
        Increments the module-level ``index`` counter once per album found
        and calls ``LoadPage`` for each, which downloads images to disk.
        Prints "no page " and returns early on a non-200 response.
    """
    global index
    # Build the listing URL for this category/page combination.
    url = 'http://www.kuaizhan29.com/tuku/index-' + \
        str(pro) + '-0-0-0-0-' + str(page_num) + '.html'
    # Send a browser User-Agent so the site does not reject the request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    # timeout bounds the wait so an unresponsive server cannot hang the crawl
    resp = requests.get(url, headers=headers, timeout=10)
    if resp.status_code != 200:
        print("no page ")
        return
    # Parse the returned HTML.
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html.parser')
    # Each album preview on the listing page sits inside a div.imgbox.
    for item in soup.find_all('div', attrs={"class": 'imgbox'}):
        index += 1
        album_url = item.find_all('a')[0].attrs['href']
        LoadPage(album_url, index)
def LoadPage(websate, index):
    """Fetch one album page and download each image it contains.

    Args:
        websate: URL of the album page (name kept for backward
            compatibility; presumably a typo of "website").
        index: album counter, used as the download subdirectory name.
    """
    # Consistent with get_one_page_data: send a browser UA and bound the wait.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    resp2 = requests.get(str(websate), headers=headers, timeout=10)
    if resp2.status_code != 200:
        return
    soup = BeautifulSoup(resp2.content.decode('utf-8'), 'html.parser')
    img_id = 0  # renamed from ``id``, which shadowed the builtin
    # Images of the album are <li> entries inside ul.img-list.
    for img_list in soup.find_all('ul', attrs={"class": 'img-list'}):
        for entry in img_list.find_all('li'):
            img_id += 1
            src = entry.find_all("img")[0].attrs['src']
            DownImage(src, index, img_id, websate)
def DownImage(imgepath, page, i, referer):
    """Download a single image to ``./dataImage/<page>/<i>.jpg``.

    Args:
        imgepath: image URL (name kept for backward compatibility).
        page: album counter; becomes the subdirectory name.
        i: 1-based image index within the album; becomes the file name.
        referer: album page URL, sent as the Referer header — presumably
            the site rejects hotlinked image requests without it.
    """
    # Progress log: which page/image is being fetched.
    print("page " + str(page) + " " + str(i) + " " + imgepath)
    headers = {
        "Referer": referer,
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    resp = requests.get(imgepath, headers=headers, timeout=10)
    # Skip failed downloads instead of saving an error body as a .jpg.
    if resp.status_code != 200:
        return
    target_dir = './dataImage/' + str(page)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(target_dir, exist_ok=True)
    with open(target_dir + '/' + str(i) + '.jpg', 'wb') as file:
        file.write(resp.content)
    time.sleep(0.1)  # brief pause to avoid hammering the image server
if __name__ == '__main__':
    # Entry point: crawl listing pages 1-9 of categories 1-4. The guard
    # prevents the crawl from firing as a side effect of importing this module.
    for x in range(1, 5):
        for y in range(1, 10):
            get_one_page_data(x, y)
            time.sleep(0.5)  # throttle between listing pages
这里都是比较简单和基础的用法,应该都可以看懂。有兴趣的可以试试