Crawler tools: requests (HTTP requests), BeautifulSoup from bs4 with the lxml parser (HTML parsing), pandas (data storage).
Basic crawler workflow: request the page, parse the HTML, extract the fields you need, then store the data.
Code example:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://www.roseonly.com.cn/list/xianhuameigui.html'  # listing page to crawl
# Send a User-Agent header so the request looks like it comes from a real browser
headers = {"User-Agent": "your own browser's User-Agent string"}
# 1. Request the page
r = requests.get(base_url, headers=headers)
# Status code 200 means the request succeeded
# print(r.status_code)
# Inspect the returned HTML
# print(r.text)
# 2. Parse the page
soup = BeautifulSoup(r.content, 'lxml')
products = soup.find_all('li')  # every <li> on the page; non-product items are filtered out below
# 3. Prepare containers for the scraped data
products_list = []
link_list = []
# print(products)
for product in products:
    name = product.find('div', class_='pro-name')
    price = product.find('div', class_='pro-price')
    img = product.find('div', class_='pro-img')
    link = product.find('a')
    # Only <li> elements containing all four parts are real product cards
    if name is not None and price is not None and img is not None and link is not None:
        link_list.append(link['href'])
        product_map = {
            'name': name.text,
            'price': price.text,
            'img': img.find('img')['src'],
            'link': link['href']
        }
        products_list.append(product_map)
# print(products_list)
# 4. Store the data (uncomment to write the listing to a CSV file)
# df = pd.DataFrame(products_list)
# df.to_csv('roseonly.csv')
# print('Data saved!')
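# Tip (my addition, not in the original script): pandas' to_csv accepts
# index=False to drop the row-number column, and encoding='utf-8-sig' so that
# Chinese text opens cleanly in Excel:
# df.to_csv('roseonly.csv', index=False, encoding='utf-8-sig')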
# 5. Visit each product's detail page and collect its attribute list
detail_list = []
for detail_url in link_list:
    r = requests.get(detail_url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    product_details = soup.find_all('ul', class_='pro-attr-box')
    for detail in product_details:
        items = detail.find_all('li')
        for item in items:
            # print(item.text)
            detail_list.append(item.text)
print(detail_list)
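The loop above fires one request per product link with no delay and no error handling, and the flat detail_list loses track of which product each attribute belongs to. Below is a minimal sketch of a politer variant under the same page-structure assumptions; the fetch_details helper, the one-second delay, the timeout, and the details.csv filename are my own choices, not part of the original script.

import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

def fetch_details(link_list, headers, delay=1.0):
    """Fetch each product page's attribute list, pausing between requests."""
    rows = []
    for detail_url in link_list:
        try:
            r = requests.get(detail_url, headers=headers, timeout=10)
            r.raise_for_status()  # fail fast on 4xx/5xx responses
        except requests.RequestException as exc:
            print(f'skipping {detail_url}: {exc}')
            continue
        soup = BeautifulSoup(r.content, 'lxml')
        # Same selectors as above; keep the source link with every attribute
        for ul in soup.find_all('ul', class_='pro-attr-box'):
            for li in ul.find_all('li'):
                rows.append({'link': detail_url, 'attr': li.text.strip()})
        time.sleep(delay)  # be polite: rate-limit the crawl
    return rows

# Usage, reusing link_list and headers from the script above:
# details_df = pd.DataFrame(fetch_details(link_list, headers))
# details_df.to_csv('details.csv', index=False, encoding='utf-8-sig')

Keeping the link in each row means the detail attributes can later be joined back to the listing data on the 'link' column.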