Scraping in Practice 1: Fetching Discount Info with Requests
We'll use a foreign e-commerce site as our example:
Each listing has a product name, price, image, discount, and so on. Here we scrape every product's name, price, discount, and link, and write them to a CSV file.
Writing to CSV
First, let's prepare the CSV-writing code. The logic is simple: each product becomes one row in the CSV file.
import csv
import hashlib
import os
from datetime import datetime

# Get the current date as a string, e.g. 20240101
current_date = datetime.now().strftime("%Y%m%d")
csv_path = f"output/spider_product_info_{current_date}.csv"

def write_csv(search_str: str,
              website: str,
              original_price: str,
              discount_price: str,
              product_name: str,
              product_href: str,
              spider_time: str) -> None:
    """
    Write one product to the CSV file. Takes 7 strings as arguments.
    This function does no formatting of its own! Do not modify it!
    Arguments: search_str is the search keyword, website the site searched,
    original_price the original price, discount_price the discounted price,
    product_name the product name, product_href the product link,
    and spider_time the crawl timestamp.
    If the file does not exist or is empty, the header row is written first.
    """
    columns = [
        "pk_id",
        "search_str",
        "website",
        "original_price",
        "discount_price",
        "product_name",
        "product_href",
        "spider_time"
    ]
    # Create the file and write the header when it is missing or empty
    if not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0:
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter='\u0001')
            writer.writerow(columns)
    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\u0001')
        spider_time_obj = datetime.strptime(spider_time, "%Y-%m-%d %H:%M:%S")
        # Keep only the date part, converted back to a string
        new_time = spider_time_obj.strftime("%Y-%m-%d")
        # pk_id is an MD5 of site + product + date, so re-crawling the same
        # product on the same day produces the same key
        pk_id_str = f"{website}_{product_name}_{new_time}"
        pk_id = hashlib.md5(pk_id_str.encode()).hexdigest()
        writer.writerow([
            pk_id,
            search_str,
            website,
            original_price,
            discount_price,
            product_name,
            product_href,
            spider_time
        ])
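A quick usage sketch (every sample value below is made up, and it assumes the output/ directory already exists). The \u0001 delimiter is a non-printing character, so commas inside product names cannot break the columns; read the file back with the same delimiter:

from datetime import datetime

# Hypothetical sample row; every field value here is invented
write_csv("vape kit", "Vape Club", "£29.99", "£19.99",
          "Example Starter Kit", "https://www.vapeclub.co.uk/example",
          datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Read the file back with the same delimiter
with open(csv_path, encoding='utf-8') as f:
    for row in csv.reader(f, delimiter='\u0001'):
        print(row)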
The scraper
We implement this as a single function: pass in a search term, scrape, and write to CSV. Taking the search term as a parameter lets us reuse the function to crawl several keywords separately.
import random
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Assumed to be defined elsewhere in the project: NO_DISCOUNT marks products
# without an offer, and re_proxies is the proxies dict passed to requests
NO_DISCOUNT = "no discount"
re_proxies = None  # e.g. {"http": "http://...", "https": "http://..."}

def spider_vape_club(search_str: str):
    print(f"Processing {search_str} ...")
    url = f"https://www.vapeclub.co.uk/search/?sSearchString={search_str}"
    response = requests.get(url, proxies=re_proxies)
    soup = BeautifulSoup(response.content, 'html.parser')
    finished = False
    count = 0
    page = 0
    while not finished:
        page += 1
        spider_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # The product grid
        product_area = soup.find('div', id="productResults")
        product_tags = product_area.find_all('div', class_="productGridItem")
        for product_tag in product_tags:
            discount_tag = product_tag.find('div', class_="offersDisplayGrid")
            h2_tag = product_tag.find('h2')
            product_href = "https://www.vapeclub.co.uk" + h2_tag.find('a')['href']
            product_name = h2_tag.text.strip()
            if discount_tag:
                discount_price = discount_tag.text.strip()
                if not discount_price:
                    discount_price = NO_DISCOUNT
            else:
                discount_price = NO_DISCOUNT
            original_price_tag = product_tag.find('div', class_="productGridPrice")
            original_price = original_price_tag.text.strip()
            write_csv(search_str, "Vape Club", original_price, discount_price,
                      product_name, product_href, spider_time)
            count += 1
        # The site redirects to the last page when the requested page number is
        # past the end, so if the page number in the response URL no longer
        # matches what we asked for, we have reached the last page
        new_page_url = url + f"&iPageNumber={page+1}"
        time.sleep(random.uniform(2, 4))
        new_page_res = requests.get(new_page_url, proxies=re_proxies)
        new_page_redirected_url = new_page_res.url
        if new_page_redirected_url.endswith(str(page+1)):
            soup = BeautifulSoup(new_page_res.content, 'html.parser')
        else:
            finished = True
    print(f"Finished {search_str}: {page} pages, {count} results")
This site is fairly easy to handle: building the URL straight from the keyword is enough to scrape it.
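One caveat: a keyword containing spaces or special characters should be URL-encoded before it is interpolated into the query string. A minimal sketch with the standard library (the keyword below is hypothetical):

from urllib.parse import quote_plus

search_str = "nic salt 10ml"  # hypothetical multi-word keyword
url = f"https://www.vapeclub.co.uk/search/?sSearchString={quote_plus(search_str)}"
# quote_plus encodes spaces as '+' so the query string stays valid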