有了这个文件,写电商爬虫就像调参一样简单
spider.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
from os import path
import re
import socket
import logging
import sys
import csv
import datetime
import random
# from .ssctrl import ssThread # ss代理,根据不同的情况修改代理池
# Root logger setup: records at INFO level and above are written to stdout,
# each prefixed with a timestamp, the source file name and the line number.
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[_stdout_handler],
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
)
def save_data(file_path, data):
    """
    Append one record to a CSV file as a single row.

    The file is created if it does not exist and is always written as UTF-8.

    :param file_path: path of the CSV file to append to
    :param data: iterable of field values forming one row
    :return: None
    """
    # newline='' lets the csv module control line endings itself; append-only
    # mode is enough because the file is never read back here.
    with open(file_path, "a", newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerow(data)
class AutoGlance:
    """
    Base class wrapping a Selenium Chrome driver with small browsing helpers.

    The first part of the class offers generic helpers (locating elements,
    clicking, typing, scrolling, window handling). The methods in the last
    section are empty hooks meant to be overridden by site-specific subclasses.
    """

    def __init__(self, ip=None, por=1, img=True):
        """
        Configure Chrome and start the browser with a maximized window.

        :param ip: proxy IP address; currently unused because the proxy block
            below is commented out
        :param por: proxy port offset (original note: "port number + 1080");
            currently unused for the same reason
        :param img: whether to load images (default True); when False, images
            are switched off through the Chrome preferences
        """
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('disable-infobars')
        if not img:
            prefs = {'profile.default_content_setting_values': {'images': 2}}
            self.options.add_experimental_option('prefs', prefs)
        # Proxy support (disabled). Adapt it to your own proxy pool if needed:
        # if ip is not None:
        #     self.ip = ip
        #     por = int(por)
        #     ss_obj = ssThread()
        #     port = ss_obj.startSS(self.ip, por)
        #     self.options.add_argument('--proxy-server=socks5://127.0.0.1:%s' % port)
        # `options=` replaces the deprecated `chrome_options=` keyword, which
        # newer Selenium releases no longer accept.
        self.driver = webdriver.Chrome(options=self.options)
        self.driver.maximize_window()

    @staticmethod
    def clear_input(element, max_chars=100):
        """
        Empty an input box by simulating BACKSPACE key presses.

        :param element: the input element to clear
        :param max_chars: how many BACKSPACE presses to send (default 100)
        :return: None
        """
        for _ in range(max_chars):
            element.send_keys(Keys.BACK_SPACE)

    @staticmethod
    def is_number(s):
        """
        Tell whether ``s`` can be read as a number.

        A value counts as a number when ``float(s)`` succeeds or, failing
        that, when ``unicodedata.numeric(s)`` succeeds (single numeric
        characters such as CJK numerals).

        :param s: value to test
        :return: True if numeric, False otherwise (including ``None``)
        """
        try:
            float(s)
            return True
        except (TypeError, ValueError):
            # TypeError covers None and other non-string, non-number inputs.
            pass
        try:
            import unicodedata
            unicodedata.numeric(s)
            return True
        except (TypeError, ValueError):
            return False

    @staticmethod
    def check_ip(ip, serv_port):
        """
        Probe a TCP endpoint; intended for validating a proxy IP.

        :param ip: host to connect to
        :param serv_port: TCP port to connect to
        :return: the result of ``socket.connect_ex`` (0 on success, otherwise
            an error number)
        """
        # The context manager closes the socket even if connect_ex raises.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(3)
            return sock.connect_ex((ip, serv_port))

    def send_to_input(self, text, value, key='xpath', wait=10):
        """
        Type text into the first element matching a locator.

        Typically used for search boxes, address fields and quantity fields.

        :param text: text to type
        :param value: locator expression
        :param key: locator strategy, see :meth:`find_elements`
        :param wait: maximum number of seconds to wait for the element
        :return: True if the text was sent, False if no element was found or
            sending failed
        """
        input_elem = self.find_elements(value, keys=key, wait=wait)
        if not input_elem:
            return False
        try:
            input_elem[0].send_keys(text)
            sleep(0.05)
            return True
        except Exception as e:
            print(e)
            return False

    def find_elements(self, values, keys='xpath', wait=15):
        """
        Locate all elements matching a locator, waiting until they are present.

        :param values: str, the locator expression
        :param keys: str, locator strategy: 'xpath', 'class_name', 'id',
            'name', 'target' (tag name) or 'text' (partial link text); any
            other value is treated as a CSS selector
        :param wait: int, maximum number of seconds to wait
        :return: list of elements, empty if nothing was found or an error
            occurred (the error is printed)
        """
        try:
            strategies = {
                'xpath': By.XPATH,
                'class_name': By.CLASS_NAME,
                'id': By.ID,
                'name': By.NAME,
                'target': By.TAG_NAME,
                'text': By.PARTIAL_LINK_TEXT,
            }
            locator = (strategies.get(keys, By.CSS_SELECTOR), values)
            return WebDriverWait(self.driver, wait).until(
                EC.presence_of_all_elements_located(locator))
        except Exception as er:
            print(er)
            return []

    # Wait for a page transition
    def wait_for_page(self, url, wait=3):
        """
        Poll once per second until the current URL matches ``url``.

        :param url: regular expression searched (case-insensitively) in the
            current URL
        :param wait: timeout in MINUTES
        :return: None as soon as the URL matches, True if the timeout elapsed
            without a match
        """
        pattern = re.compile(url, re.M | re.I)
        deadline = datetime.datetime.now() + datetime.timedelta(minutes=wait)
        while datetime.datetime.now() < deadline:
            sleep(1)
            if pattern.search(self.driver.current_url):
                return None
        return True

    # Close other windows
    def close_other(self, reserve=1):
        """
        Close the most recently opened windows until ``reserve`` remain.

        After each close the driver is switched to the last remaining window.

        :param reserve: number of windows to keep; values below 1 are treated
            as 1 so there is always a window left to switch to
        :return: None
        """
        keep = max(reserve, 1)
        while len(self.driver.window_handles) > keep:
            self.driver.switch_to.window(self.driver.window_handles[-1])
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[-1])
            sleep(0.5)

    def click_elem(self, value, key="xpath", wait=10):
        """
        Click the first element matching a locator.

        :param value: locator expression
        :param key: locator strategy, see :meth:`find_elements`
        :param wait: maximum number of seconds to wait for the element
        :return: True if the click succeeded; False if nothing was found, the
            element is disabled, or the click raised
        """
        elem = self.find_elements(values=value, keys=key, wait=wait)
        if not elem:
            return False
        try:
            if elem[0].get_attribute('disabled') == 'true':
                return False
            elem[0].click()
            sleep(0.05)
            return True
        except Exception as e:
            print(e, "点击失败")
            return False

    def get_text(self, value, key="xpath", wait=10):
        """
        Collect the visible text of every element matching a locator.

        :param value: locator expression
        :param key: locator strategy, see :meth:`find_elements`
        :param wait: maximum number of seconds to wait
        :return: list of texts, or ``['0']`` when nothing matched
        """
        elem = self.find_elements(values=value, keys=key, wait=wait)
        return [item.text for item in elem] or ['0']

    def get_attribute(self, value, attr='src', key="xpath", wait=10):
        """
        Collect one attribute from every element matching a locator.

        :param value: locator expression
        :param attr: attribute name to read (default 'src')
        :param key: locator strategy, see :meth:`find_elements`
        :param wait: maximum number of seconds to wait
        :return: list of attribute values, or ``['0']`` when nothing matched
        """
        elem = self.find_elements(values=value, keys=key, wait=wait)
        return [item.get_attribute(attr) for item in elem] or ['0']

    # Scroll the page
    def scroll(self, min_times=4, max_times=8):
        """
        Scroll down the page a random number of steps.

        Step ``i`` sets the document's scrollTop to ``1000 * i`` and is
        followed by a random pause of less than one second.

        :param min_times: lower bound for the number of steps
        :param max_times: upper bound for the number of steps
        :return: None
        """
        try:
            low, high = int(float(min_times)), int(float(max_times))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric bounds: fall back to the documented defaults.
            low, high = 4, 8
        if low > high:
            low, high = high, low
        for step in range(random.randint(low, high)):
            js = "var q=document.documentElement.scrollTop={}".format(1000 * step)
            self.driver.execute_script(js)
            sleep(random.random())

    def browse(self, elements):
        """
        Open each element in turn, scroll the opened page, then clean up.

        Each click is expected to open a new window; that window is scrolled
        and the driver returns to the first window before the next click.
        All extra windows are closed at the end.

        :param elements: list of clickable elements to visit
        :return: None
        """
        if not elements:
            return
        self.scroll(max_times=10)
        # Visit every product detail page
        for element in elements:
            element.click()
            sleep(1)
            # The newest window is the one this click just opened.
            self.driver.switch_to.window(self.driver.window_handles[-1])
            self.scroll(max_times=10)
            # Switch back to the search page
            self.driver.switch_to.window(self.driver.window_handles[0])
            sleep(random.random())
        self.close_other()

    def multiple_click(self, element, times=1):
        """
        Click an element ``times`` times, pausing briefly after each click.

        :param element: element to click
        :param times: number of clicks; a non-numeric value falls back to a
            single click, and values below 1 result in no click
        :return: None
        """
        try:
            count = int(float(times))
        except (TypeError, ValueError, OverflowError):
            count = 1
        for _ in range(count):
            element.click()
            sleep(0.05)

    def get_url(self, url):
        """
        Load ``url`` in the current window.

        :param url: address to open
        :return: None
        """
        self.driver.get(url)

    def ctrl_down_and_click(self, element):
        """
        Click an element while holding CTRL, then switch to the last window.

        :param element: element to click
        :return: None
        """
        ActionChains(self.driver).key_down(Keys.CONTROL).perform()
        try:
            element.click()
        finally:
            # Always release CTRL so a failed click cannot leave it pressed.
            ActionChains(self.driver).key_up(Keys.CONTROL).perform()
        sleep(0.5)
        self.driver.switch_to.window(self.driver.window_handles[-1])

    def next_page(self, value, key="xpath", wait=10):
        """
        Go to the next page by clicking the element matching a locator.

        :param value: locator expression of the "next page" control
        :param key: locator strategy, see :meth:`find_elements`
        :param wait: maximum number of seconds to wait for the control
        :return: True if the click succeeded, False otherwise
        """
        return bool(self.click_elem(value, key, wait))

    # ------------------ The methods below are meant to be overridden ------------------ #
    def login(self, account, password):
        """
        Log in.

        :param account: account name
        :param password: account password
        :return:
        """
        pass

    def update(self):
        """
        Update the database.

        :return:
        """
        pass

    def check_abnormal_url(self):
        """
        Detect abnormal URLs.

        :return:
        """
        pass

    def check_account_status(self):
        """
        Check the account status.

        :return:
        """
        pass

    def check_task_status(self):
        """
        Check the task status.

        :return:
        """
        pass

    def close_mask_layer(self):
        """
        Remove the mask (overlay) layer.

        :return:
        """
        pass

    def is_out_of_money(self):
        """
        Check whether the balance is insufficient, send an SMS reminder to
        top up, then enter a polling loop.

        :return:
        """
        pass

    def handel_order(self):
        """
        Process an order.

        The method name keeps its original spelling so existing overrides
        and callers continue to work.

        :return:
        """
        pass

    def handle_address(self):
        """
        Process the buyer's shipping address.

        :return:
        """
        pass

    def pay(self):
        """
        Make the payment.

        :return:
        """
        pass

    def record(self):
        """
        Record results.

        :return:
        """
        pass
示例
test.py
from .spider import *
from selenium.webdriver.common.action_chains import ActionChains
class Test(AutoGlance):
    """
    Example AliExpress crawler built on top of AutoGlance.

    It logs in, walks the category navigation, pages through each product
    list and prints the text of every product detail page.
    """

    def __init__(self, data, ip=None, por=None):
        """
        :param data: sequence whose first item is the account and whose
            second item is the password
        :param ip: forwarded to AutoGlance
        :param por: forwarded to AutoGlance
        """
        super(Test, self).__init__(ip, por)
        self.data = data

    def login(self, account=None, password=None):
        """
        Log in through the AliExpress buyer login page.

        :param account: account to use; defaults to ``self.data[0]``
        :param password: password to use; defaults to ``self.data[1]``
        :return: None
        """
        if account is None:
            account = self.data[0]
        if password is None:
            password = self.data[1]
        self.driver.get("https://login.aliexpress.com/buyer.htm")
        self.driver.switch_to.frame("alibaba-login-box")  # switch into the login frame
        self.send_to_input(account, "//*[@id='fm-login-id']")  # account
        sleep(2)
        self.send_to_input(password, "//*[@id='fm-login-password']")  # password
        sleep(2)
        self.click_elem("//*[@id='login-form']/div[5]/button")  # click the login button

    def close_layer(self, wait=4):
        """
        Dismiss promotional overlays (coupon pop-ups) on a best-effort basis.

        Every element whose class contains 'close' is clicked; elements that
        cannot be clicked are skipped.

        :param wait: maximum number of seconds to wait for such elements
        :return: None
        """
        close_layer_elem = self.find_elements("//*[contains(@class,'close')]", wait=wait)
        for elem in close_layer_elem:
            try:
                elem.click()
                sleep(0.1)
            except Exception:
                # Deliberately ignored: a matching element that cannot be
                # clicked is simply skipped.
                continue

    def get_categories_list(self):
        """
        Walk the category navigation and crawl every category.

        Each category link is opened with a CTRL-click, its product list is
        crawled, and all windows except one are closed before the next one.

        :return: None
        """
        self.wait_for_page("www.aliexpress.com")
        self.close_layer()
        categories_list = self.find_elements("//dt[@class='cate-name']//a")
        for category in categories_list:
            # Dismiss overlays before CTRL is pressed so the overlay clicks
            # themselves are not performed as CTRL-clicks.
            self.close_layer()
            self.ctrl_down_and_click(category)
            self.wait_for_page("category")
            self.get_products_list()
            self.close_other(1)

    def get_products_list(self):
        """
        Crawl the current product list and every following page.

        For each page the product links are collected and clicked one by one;
        each detail page is processed and windows are closed until two remain.
        Paging stops when :meth:`next_page` reports that there is no next page.

        :return: None
        """
        # A loop instead of one recursive call per page keeps the call stack
        # flat no matter how many pages a category has.
        while True:
            products_list = self.find_elements("//h3/a")
            if not products_list:
                products_list = self.find_elements(
                    "//*[contains(@class,'item')]//div[contains(@class,'title')]/a")
            self.close_layer()
            self.scroll(5, 10)
            for index, product in enumerate(products_list, start=1):
                print(index)
                product.click()
                self.get_product_detail()
                self.close_other(2)
            if not self.next_page():
                return
            sleep(1)

    def get_product_detail(self):
        """
        Process the most recently opened window as a product detail page.

        The page is scrolled and the text of its root element is printed.

        :return: None
        """
        sleep(0.5)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.close_layer()
        self.scroll(10, 15)
        page_root = self.find_elements('/html')
        if page_root:
            print(page_root[0].text)

    def next_page(self, value=None, key="xpath", wait=3):
        """
        Click the "next page" control.

        Called without arguments, the two pagination controls known to this
        crawler are tried in turn. When ``value`` is given, the call is
        delegated to the AutoGlance implementation with that locator.

        :param value: optional locator of the "next page" control
        :param key: locator strategy, used only when ``value`` is given
        :param wait: maximum number of seconds to wait for a control
        :return: bool, True if a next page was opened, False otherwise
        """
        print("next page")
        if value is not None:
            return bool(super(Test, self).next_page(value, key, wait))
        next_elem = self.find_elements("//*[@class='page-next ui-pagination-next']", wait=wait)
        if not next_elem:
            next_elem = self.find_elements(
                "//*[contains(@class,'next-pagination-item next-next')]", wait=wait)
        if not next_elem:
            return False
        try:
            next_elem[0].click()
            sleep(0.5)
            return True
        except Exception as e:
            print(e)
            return False

    def main(self):
        """
        Placeholder for assembling the crawl steps, e.g. call ``self.login()``
        first and then fetch the platform's product categories.
        """
        pass
# Example entry point. This module uses a relative import (`from .spider import *`),
# so it has to be run as part of a package rather than as a standalone script.
if __name__ == '__main__':
    # Placeholder credentials in the form [account, password]; the original
    # note presumably says they are fake -- replace them with real ones.
    datas = ['qq123456789@gmail.com', '0123456789']
    test = Test(data=datas)
    try:
        test.login()
        test.get_categories_list()
    finally:
        # Always shut the browser down so no driver process is left behind.
        test.driver.quit()