import requests
from bs4 import BeautifulSoup
# Search-results URL on bookschina.com (the query string carries stp=python).
url = 'http://www.bookschina.com/book_find2/default.aspx?stp=python&scate=0&f=1&sort=0&asc=0&sh=0&so=1&p=2&pb=1'
# The page is fetched once at module level; spider() reads the resulting
# `html`. The timeout keeps a stalled connection from hanging the script.
res = requests.get(url, timeout=10)
html = res.text
# Scrape
def spider():
    """Parse the module-level ``html`` and print the matching elements.

    Collects every ``<div class="infor">`` element with BeautifulSoup (lxml
    parser) and prints the resulting list. Returns nothing.
    """
    soup = BeautifulSoup(html, 'lxml')
    all_list = soup.find_all('div', class_='infor')
    print(all_list)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    spider()
代码
```python
import os
import requests
from bs4 import BeautifulSoup
import queue
import time
import threading
# Request headers: send a desktop Chrome User-Agent with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.54 Safari/537.36'
    ),
}
# Listing URL prefix; page() appends the page number to it.
page_url = 'http://www.bookschina.com/fivestar/default.aspx?&page='
# FIFO queue of page URLs: page() fills it, spider() drains it.
url_queue = queue.Queue()
# Pagination: enqueue the URL of every listing page to crawl
def page(pages=3):
    """Enqueue the listing-page URLs to crawl.

    Builds ``page_url + str(i)`` for every ``i`` in ``range(pages)``, prints
    a progress line and the URL, and puts the URL on the module-level
    ``url_queue``.

    Args:
        pages: Number of listing pages to enqueue, numbered from 0.
            Defaults to 3, the value that was previously hard-coded.
    """
    for i in range(pages):
        url = page_url + str(i)
        print(f"开始爬取第{i}页")
        print(f'\n{url}')
        url_queue.put(url)
# Scraping worker
def _safe_filename(name):
    """Return *name* stripped, with characters illegal in file names replaced by '_'."""
    table = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})
    return name.strip().translate(table)


def _parse_book(item):
    """Extract ``(name, record)`` from one ``div.infor`` element.

    ``record`` is the concatenation of title, author, selling price,
    discount and list price, in that order.

    Raises:
        AttributeError: if an expected tag is missing (``find`` returned None).
    """
    name = item.find('h2', class_='name').get_text()
    author = item.find('div', class_='author').get_text()
    price = item.find('span', class_='sellPrice').get_text().replace('¥', '')
    # The discount is wrapped in parentheses on the page; drop them.
    discount = item.find('span', class_='discount').get_text().replace('(', '').replace(')', '')
    # List price = its label text followed by the struck-through amount.
    list_price = (item.find('span', class_='priceTit').get_text()
                  + item.find('del').get_text().replace('¥', ''))
    return name, name + author + price + discount + list_price


def _save_book(name, record, directory='./tushu/'):
    """Write *record* to ``<directory>/<name>.txt``, creating the directory if needed."""
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, _safe_filename(name) + '.txt')
    # Explicit UTF-8 so the text does not depend on the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f'{record}\n')


def spider(url_queue):
    """Drain *url_queue*, scraping every listing page and saving one file per book.

    For each URL taken from the queue, the page is downloaded, every
    ``<div class="infor">`` element is parsed, and the book's details are
    written to ``./tushu/<title>.txt``. Returns when the queue is empty.

    A failed download or a malformed or unwritable record is reported and
    skipped, so one bad page or book no longer ends the crawl silently.

    Args:
        url_queue: A ``queue.Queue`` of page URLs (strings).
    """
    while True:
        # Non-blocking get: an empty queue ends the worker instead of
        # blocking it forever.
        try:
            url = url_queue.get_nowait()
        except queue.Empty:
            return

        try:
            res = requests.get(url=url, headers=headers, timeout=10)
            res.raise_for_status()
        except requests.RequestException as exc:
            print(url, '抓取失败:', exc)
            continue

        soup = BeautifulSoup(res.text, 'lxml')
        for item in soup.find_all('div', class_='infor'):
            try:
                name, record = _parse_book(item)
                _save_book(name, record)
            except (AttributeError, OSError) as exc:
                print('跳过一条记录:', exc)
                continue
            print(name, '下载成功')
# Entry point
def main():
    """Fill the URL queue, then crawl it on a worker thread and wait for it.

    Calls ``page()`` to enqueue the page URLs, starts one thread running
    ``spider(url_queue)``, and joins it before returning.
    """
    page()
    workers = [threading.Thread(target=spider, args=(url_queue,))]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Run the crawl only when executed as a script, and report how long it took.
if __name__ == '__main__':
    ted = time.time()
    main()
    end = time.time()
    # The elapsed time was computed but never shown; print it.
    print(f'一共下载{end - ted}秒')
```

queue线程参数
初始化 queue.Queue(maxsize):创建一个先进先出(FIFO)的队列,maxsize 为队列容量上限(小于等于 0 表示不限制)
1. qsize():返回队列的大小
2. empty():判断队列是否为空
3. full():判断队列是否满了
4. get():从队列中取出最先放入的一个数据(队首)
5. put():将一个数据放入队列(队尾)
参考文章
https://www.cnblogs.com/REN-Murphy/p/14706686.html