# Complete code (完整代码) — Taobao product scraper: manual login, search, paginate, export to Excel.
import json
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.common.exceptions import NoSuchElementException
from lxml import etree
import re
import pandas as pd
import datetime as dt
import random
import xlwt
from selenium.webdriver.support.wait import WebDriverWait
import time
def loginTB():
    """Open the Taobao login page, wait for a manual login, then search.

    Navigates to the Alipay trust-login entry that redirects back to
    www.taobao.com, blocks up to 180 s until the search box (id 'q')
    appears — i.e. the user has finished logging in — then types the
    product name read from stdin and clicks the search button.

    Uses the module-level ``browser`` WebDriver instance; returns None.
    """
    # NOTE(review): the original URL contained a mis-encoded '¶ms='
    # (HTML entity &para; swallowed '&para' of '&params') — restored.
    browser.get(
        'https://auth.alipay.com/login/index.htm?loginScene=7'
        '&goto=https%3A%2F%2Fauth.alipay.com%2Flogin%2Ftaobao_trust_login.htm'
        '%3Ftarget%3Dhttps%253A%252F%252Flogin.taobao.com%252Fmember'
        '%252Falipay_sign_dispatcher.jhtml%253Ftg%253Dhttps%25253A%25252F'
        '%25252Fwww.taobao.com%25252F'
        '&params=VFBMX3JlZGlyZWN0X3VybD1odHRwcyUzQSUyRiUyRnd3dy50YW9iYW8uY29tJTJG')
    # Give the user up to 3 minutes to complete the login by hand.
    wait = WebDriverWait(browser, 180)
    wait.until(EC.presence_of_element_located((By.ID, 'q')))
    goods = input('请输入你想爬取的商品:')
    # Selenium 4 removed find_element_by_*; use find_element(By.…, …).
    browser.find_element(By.ID, 'q').send_keys(goods)
    browser.find_element(
        By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button').click()
def get_item_list(data):
    """Parse one Taobao search-result page and extract product records.

    Parameters
    ----------
    data : str
        Raw HTML of a search-result page (``browser.page_source``).

    Returns
    -------
    list[dict]
        One dict per product with keys 'item_name', 'price', 'shop_name',
        'salse_volume' (spelling kept — consumed by save_to_excel),
        'address', 'item_url'.  Each record is also appended as one JSON
        line to 'shop_data.json' and echoed to stdout.
    """
    xml = etree.HTML(data)
    product_names = xml.xpath('//img[@class="J_ItemPic img"]/@alt')
    prices = xml.xpath('//div[@class="price g_price g_price-highlight"]/strong/text()')
    shop_names = xml.xpath('//div[@class="shop"]/a/span[last()]/text()')
    detail_urls = xml.xpath('//div[@class="pic"]/a/@href')
    sales_volumes = xml.xpath('//div[@class="deal-cnt"]/text()')
    addresses = xml.xpath('//div[@class="location"]/text()')
    data_list = []
    # Open the JSON-lines sink once instead of re-opening it per item.
    with open('shop_data.json', 'a', encoding='utf-8') as f:
        # zip() stops at the shortest list, so a page where one field is
        # missing for some item no longer raises IndexError.
        for i, (name, price, shop, url, sales, addr) in enumerate(
                zip(product_names, prices, shop_names, detail_urls,
                    sales_volumes, addresses), start=1):
            shop_info = {
                'item_name': name,
                'price': price,
                'shop_name': shop,
                'salse_volume': sales,  # key spelling kept: read by save_to_excel
                'address': addr,
                'item_url': url,
            }
            f.write(json.dumps(shop_info, ensure_ascii=False) + '\n')
            data_list.append(shop_info)
            print('正在爬取第%s件商品' % i)
            print('商品名称:%s' % name)
            print('商品单价:%s' % price)
            print('店铺名称:%s' % shop)
            print('累计售卖:%s' % sales)
            print('店铺地址:%s' % addr)
            print('详细链接:%s' % url)
            print("-" * 30)
    return data_list
def get_TB_data():
    """Scrape up to a user-chosen number of search-result pages.

    Parses the current page with get_item_list(), then scrolls to and
    clicks the 'next page' button, repeating until the requested page
    count is reached or no next button exists.  Uses the module-level
    ``browser`` WebDriver instance.

    Returns
    -------
    list[dict]
        Accumulated records from all scraped pages.
    """
    data_list = []
    max_page = int(input('输入你想爬取的页数:'))
    # Hoisted out of the loop: one reusable 60 s wait object.
    wait = WebDriverWait(browser, 60)
    page_index = 1
    while page_index <= max_page:
        print("===================正在抓取第{}页===================".format(page_index))
        print("当前页面URL:" + browser.current_url)
        data_list += get_item_list(browser.page_source)
        try:
            wait.until(EC.presence_of_element_located(
                (By.XPATH, '//a[@class="J_Ajax num icon-tag"]')))
            time.sleep(1)
            try:
                # Selenium 4 removed find_element_by_xpath.
                next_btn = browser.find_element(
                    By.XPATH, '//li[@class="item next"]')
            except NoSuchElementException:
                # No next-page button: we are on the last result page.
                print("爬取完毕!")
                break
            ActionChains(browser).move_to_element(next_btn).perform()
            time.sleep(2)
            ActionChains(browser).move_to_element(next_btn).click(next_btn).perform()
            page_index += 1
        except Exception:
            # Was a bare `except:` that retried the same page forever on
            # repeated timeouts; report and stop instead.
            print("error")
            break
    return data_list
def save_to_excel(data_list):
    """Write the scraped records to an .xls workbook via xlwt.

    Prompts for a product name used both as the sheet name and in the
    output filename '淘宝{name}商品信息.xls'.

    Parameters
    ----------
    data_list : list[dict]
        Records produced by get_item_list() / get_TB_data(); each dict
        must carry the keys listed in ``keys`` below.
    """
    print('------------------------------------------------------')
    print('将获取结果保存至excel:')
    goods_name = input('请输入你想保存的商品名:')
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet1 = workbook.add_sheet(goods_name)
    # Header titles, column widths (xlwt unit: 1/256 of a character),
    # and the record key feeding each column — kept in lockstep.
    headers = ['商品名称', '商品价格', '商品销量', '店铺名称', '店铺地址', '详细链接']
    widths = [30, 20, 10, 15, 15, 60]
    keys = ['item_name', 'price', 'salse_volume', 'shop_name', 'address', 'item_url']
    for col, (title, width) in enumerate(zip(headers, widths)):
        sheet1.write(0, col, title)
        sheet1.col(col).width = 256 * width
    # Data rows start at row 1, directly below the header.
    for row, record in enumerate(data_list, start=1):
        for col, key in enumerate(keys):
            sheet1.write(row, col, record[key])
    workbook.save('淘宝{}商品信息.xls'.format(goods_name))
    print('数据保存成功')
    print('------------------------------------------------------')
if __name__ == '__main__':
    chrome_options = webdriver.ChromeOptions()
    # Hard-coded Chrome path — adjust for your machine, or delete this
    # line to let Selenium locate the default installation.
    chrome_options.binary_location = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    # Hide the "controlled by automated software" banner and the
    # navigator.webdriver flag so Taobao's bot detection is less likely
    # to block the session.  (Stray C-style semicolon removed.)
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    # Module-level driver shared by loginTB / get_TB_data.
    browser = webdriver.Chrome(options=chrome_options)
    loginTB()
    data_list = get_TB_data()
    save_to_excel(data_list)