import requests
from bs4 import BeautifulSoup
# Fetch the first page of a dangdang.com book category and print, for every
# listing found under '#component_59', a running index followed by the text of
# its name, detail, price, star-rating and author elements.
url = 'http://category.dangdang.com/pg1-cp01.01.02.00.00.00.html'
headers = {
    # Browser-like User-Agent sent with the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops early instead of parsing an HTTP error page.
res = requests.get(url, headers=headers, timeout=10)
res.raise_for_status()
# NOTE(review): res.text is decoded with the charset requests infers from the
# response headers -- confirm it matches the page's actual encoding.
result = res.text
soup = BeautifulSoup(result, 'lxml')
list1 = soup.select('#component_59 li')


def _first_text(node, selector):
    """Return the text of the first element under *node* matching *selector*.

    Returns an empty string when nothing matches, so a listing that lacks one
    of the expected elements does not abort the whole run with an IndexError.
    """
    match = node.select_one(selector)
    return match.text if match is not None else ''


# Kept so that ``c`` is still defined (as 0) when no listings are found.
c = 0
for c, li in enumerate(list1, start=1):
    book_name = _first_text(li, '.name')
    book_detail = _first_text(li, '.detail')
    book_price = _first_text(li, '.price')
    book_search_star_black = _first_text(li, '.search_star_black')
    book_search_book_author = _first_text(li, '.search_book_author')
    print(c, book_name, book_detail, book_price, book_search_star_black, book_search_book_author)
'''
一些扩展 图片懒加载
'''
"""
参考答案
当当网:http://category.dangdang.com/pg1-cp01.01.02.00.00.00.html
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}
url = 'http://category.dangdang.com/pg1-cp01.01.02.00.00.00.html'
1.爬取数据
res = requests.get(url, headers=headers)
result = res.text
print(result)
2.解析html
soup = BeautifulSoup(result, 'lxml')
li_list = soup.select('#component_59 li')
for i, li in enumerate(li_list):
# 标题
title = li.select_one('.a').attrs['title']
# 价格
price = li.select_one('.search_now_price').text
# 图片
if i == 0:
img = li.select_one('img').attrs['src']
else:
img = li.select_one('img').attrs['data-original']
print(img)
'''
更便捷的方式
img = li.select_one('img').attrs['src']
if i != 0:
img = li.select_one('img').attrs['data-original']
更便捷的方式二:字典取值
img = li.select_one('img').attrs.get('data-original',li.select_one('img').attrs['src'])
更便捷的方式三:or
img = li.select_one('img').attrs.get('data-original') or li.select_one('img').attrs