前言:大大小小的电商网站我爬了不少,总结下来,这些网站可以分为两种类型:
第一:requests 直接获取
第二:网页动态加载,requests 无法直接获取(需要借助 selenium 和 chromedriver)
直接分享代码吧
1.先导入需要的库和chromedriver的地址(爬动态加载的网页需要,若是requests可直接获取的网站可忽略)
import time,re,pandas as pd,os,requests
from selenium import webdriver
from bs4 import BeautifulSoup
# Local filesystem path of the chromedriver executable. It is only needed when
# scraping dynamically loaded pages; sites that requests can fetch directly do
# not use it. Replace the placeholder path with the location on your machine.
CHROME_DRIVER_PATH = '/Users/xxxx/PycharmProjects/爬虫/chromedriver'
2.我先给出主函数,里面用到的方法我会在下面贴出来
我爬的是电商网站,自然是爬取列表页的商品信息(商品描述,商品链接,商品售价,商品原价)
那么下面是爬静态网页的核心函数
#处理静态网页的
def dealSoup(now_soup,cate_name,cate_url,now_page_num):
#获取有层级的分类
cate_span_tag_list = now_soup.select('.category-breadcrumb li ')
cate_all_text = ''
for span_tag in cate_span_tag_list:
cate_all_text += f"{span_tag.text.strip()}"
#获得页数
total_page_num = 1
total_num_tag_list = now_soup.select('.site-pager li')
if len(total_num_tag_list) == 0:
pass
elif len(total_num_tag_list) == 1:
total_num_tag = total_num_tag_list[1]
total_num = extractNum(total_num_tag.text)
print(int(total_num))
total_page_num = int(total_num)
else:
total_num_tag = total_num_tag_list[-2]
total_num = extractNum(total_num_tag.text)
print(int(total_num))
total_page_num = int(total_num)
#遍历全部商品
tag_list = now_soup.select('.category-list div.item')
if len(tag_list) > 0:
print(len(tag_list))
item_list = []
for tag in tag_list:
item = {
'cate_name_all' : cate_all_text[:-1],
'cate_name' : cate_name,
'cate_url' : cate_url,
'product_now_price' : 'null',
'product_old_price' : 'null'
}
desc_tag = tag.select('.name > a')[0]
price_tag_list = tag.select('.my-shop-price')
item['product_desc'] = desc_tag.text.strip()
item['product_link'] = desc_tag.attrs['href']
if len(price_tag_list) > 0:
item['product_now_price'] = price_tag_list[0].attrs['data-oprice']
item['product_old_price'] = price_tag_l