快速编写电商爬虫框架.

有了这个文件,编写电商爬虫就像调参一样简单。

spider.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
from os import path
import re
import socket
import logging
import sys
import csv
import datetime
import random

# from .ssctrl import ssThread  # ss代理,根据不同的情况修改代理池

# Root logger configuration: INFO and above go to stdout, each record carrying
# a timestamp, the source file name and line number, and the level name.
_LOG_FORMAT = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt=_LOG_DATE_FORMAT,
    format=_LOG_FORMAT,
    level=logging.INFO,
)


def save_data(file_path, data):
    """Append one record to a UTF-8 encoded CSV file.

    The file is opened in append mode, so it is created when missing and
    existing rows are kept.

    :param file_path: path of the CSV file to append to
    :param data: iterable of field values forming a single CSV row
    :return: None
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_path, "a+", newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerow(data)


class AutoGlance:
    """Chrome/Selenium helper base class for e-commerce crawlers.

    Bundles small utilities for locating elements, typing, clicking,
    scrolling and window handling around one Chrome WebDriver instance
    (``self.driver``).  The methods in the lower part of the class are empty
    hooks intended to be overridden by site-specific subclasses.
    """

    def __init__(self, ip=None, por=1, img=True):
        """Start a maximized Chrome session.

        :param ip: proxy IP address; currently unused because the proxy
            set-up below is commented out
        :param por: proxy port offset ("port number + 1080" per the original
            note); currently unused for the same reason
        :param img: whether to load images; pass False to disable them
        """
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('disable-infobars')
        if not img:
            # Content-setting value 2 blocks images.
            prefs = {'profile.default_content_setting_values': {'images': 2}}
            self.options.add_experimental_option('prefs', prefs)
        # Proxy module, disabled.  Re-enable together with the ssctrl import:
        # if ip is not None:
        #     self.ip = ip
        #     por = int(por)
        #     ss_obj = ssThread()
        #     port = ss_obj.startSS(self.ip, por)
        #     self.options.add_argument('--proxy-server=socks5://127.0.0.1:%s' % port)
        # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
        self.driver = webdriver.Chrome(options=self.options)
        self.driver.maximize_window()

    @staticmethod
    def clear_input(element):
        """Clear an input box by sending 100 BACKSPACE key presses to it.

        :param element: the input element to clear
        :return: None
        """
        for _ in range(100):
            element.send_keys(Keys.BACK_SPACE)

    @staticmethod
    def is_number(s):
        """Return True when ``s`` can be read as a number.

        Accepts anything ``float()`` accepts, plus single Unicode numeric
        characters recognised by ``unicodedata.numeric``.  Any other value,
        including ``None``, yields False instead of raising.
        """
        try:
            float(s)
            return True
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric input such as None.
            pass
        try:
            import unicodedata
            unicodedata.numeric(s)
            return True
        except (TypeError, ValueError):
            return False

    @staticmethod
    def check_ip(ip, serv_port):
        """Probe a TCP endpoint; meant for checking a proxy before using it.

        :param ip: host to connect to
        :param serv_port: TCP port to connect to
        :return: the ``connect_ex`` result code, 0 when the connection
            succeeded and an errno value otherwise
        """
        # The context manager closes the socket even if connect_ex raises.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(3)
            return sock.connect_ex((ip, serv_port))

    def send_to_input(self, text, value, key='xpath', wait=10):
        """Type ``text`` into the first element matching the locator.

        Typical uses are search boxes, address fields and quantity fields.

        :param text: text to send
        :param value: locator expression
        :param key: locator strategy, see :meth:`find_elements`
        :param wait: maximum number of seconds to wait for the element
        :return: True when the keys were sent, False when the element was
            not found or sending failed
        """
        input_elem = self.find_elements(value, keys=key, wait=wait)
        if not input_elem:
            return False
        try:
            input_elem[0].send_keys(text)
            sleep(0.05)
            return True
        except Exception as err:
            logging.warning("send_to_input(%r) failed: %s", value, err)
            return False

    def find_elements(self, values, keys='xpath', wait=15):
        """Wait for and return all elements matching a locator.

        :param values: str, the locator expression
        :param keys: str, locator strategy: 'xpath', 'class_name', 'id',
            'name', 'target' (tag name) or 'text' (partial link text); any
            other value is treated as a CSS selector
        :param wait: int, maximum number of seconds to wait
        :return: list of elements, empty when nothing was found in time or
            the lookup raised
        """
        # Built per call so the class body does not need selenium at
        # definition time.
        strategies = {
            'xpath': By.XPATH,
            'class_name': By.CLASS_NAME,
            'id': By.ID,
            'name': By.NAME,
            'target': By.TAG_NAME,
            'text': By.PARTIAL_LINK_TEXT,
        }
        strategy = By.CSS_SELECTOR
        if isinstance(keys, str):
            strategy = strategies.get(keys, By.CSS_SELECTOR)
        try:
            return WebDriverWait(self.driver, wait).until(
                EC.presence_of_all_elements_located((strategy, values)))
        except Exception as err:
            logging.warning("find_elements(%r, keys=%r) found nothing: %s",
                            values, keys, err)
            return []

    def wait_for_page(self, url, wait=3):
        """Block until the browser URL matches ``url`` or the time is up.

        The current URL is checked once per second, sleeping before each
        check.  ``url`` is used as a regular expression and matched
        case-insensitively with ``re.search``.

        :param url: regular expression the current URL must contain
        :param wait: timeout in minutes
        :return: None as soon as the URL matches, True when the timeout
            elapsed without a match
        """
        # NOTE(review): the return values look inverted (falsy on success,
        # True on timeout); they are kept unchanged for existing callers.
        deadline = datetime.datetime.now() + datetime.timedelta(minutes=wait)
        while datetime.datetime.now() < deadline:
            sleep(1)
            if re.search(url, self.driver.current_url, re.M | re.I):
                return
        return True

    def close_other(self, reserve=1):
        """Close the newest windows until only ``reserve`` remain.

        Afterwards the driver is switched to the last remaining window.

        :param reserve: number of windows to keep open
        :return: None
        """
        while len(self.driver.window_handles) > reserve:
            self.driver.switch_to.window(self.driver.window_handles[-1])
            self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[-1])
        sleep(0.5)

    def click_elem(self, value, key="xpath", wait=10):
        """Click the first element matching the locator.

        :param value: locator expression
        :param key: locator strategy, see :meth:`find_elements`
        :param wait: maximum number of seconds to wait for the element
        :return: True when the click was issued; False when the element was
            not found, its ``disabled`` attribute is 'true', or an error
            occurred
        """
        elem = self.find_elements(values=value, keys=key, wait=wait)
        if not elem:
            return False
        try:
            # Inside the try block so an element that went stale is reported
            # as a failed click rather than an unhandled exception.
            if elem[0].get_attribute('disabled') == 'true':
                return False
            elem[0].click()
            sleep(0.05)
            return True
        except Exception as err:
            logging.warning("%s 点击失败", err)
            return False

    def get_text(self, value, key="xpath", wait=10):
        """Return the ``text`` of every element matching the locator.

        :return: list of strings, or ``['0']`` when nothing matched
        """
        elems = self.find_elements(values=value, keys=key, wait=wait)
        return [elem.text for elem in elems] or ['0']

    def get_attribute(self, value, attr='src', key="xpath", wait=10):
        """Return attribute ``attr`` of every element matching the locator.

        :return: list of attribute values, or ``['0']`` when nothing matched
        """
        elems = self.find_elements(values=value, keys=key, wait=wait)
        return [elem.get_attribute(attr) for elem in elems] or ['0']

    def scroll(self, min_times=4, max_times=8):
        """Scroll the page down in a random number of 1000-pixel steps.

        The number of steps is drawn uniformly from
        ``[min_times, max_times]`` (swapped first when given in the wrong
        order), with a random pause of under one second after each step.
        Nothing happens when either bound is not numeric.

        :param min_times: lower bound for the number of steps
        :param max_times: upper bound for the number of steps
        :return: None
        """
        # Both bounds are validated; only min_times used to be checked.
        if not (self.is_number(min_times) and self.is_number(max_times)):
            return
        try:
            low, high = int(float(min_times)), int(float(max_times))
        except (TypeError, ValueError, OverflowError):
            return
        if low > high:
            low, high = high, low
        for step in range(random.randint(low, high)):
            js = "var q=document.documentElement.scrollTop={}".format(1000 * step)
            self.driver.execute_script(js)
            sleep(random.random())

    def browse(self, elements):
        """Open each element in turn and scroll through the opened page.

        Each click is expected to open a new window; window ``index + 1`` is
        scrolled, then the first window is selected again.  All extra
        windows are closed at the end.

        :param elements: list of clickable elements to visit
        :return: None
        """
        if len(elements) == 0:
            return
        self.scroll(max_times=10)

        # Visit the product detail pages one by one.
        for index, element in enumerate(elements):
            element.click()
            sleep(1)
            self.driver.switch_to.window(self.driver.window_handles[index + 1])
            self.scroll(max_times=10)
            # Back to the search page.
            self.driver.switch_to.window(self.driver.window_handles[0])
            sleep(random.random())
        self.close_other()

    def multiple_click(self, element, times=1):
        """Click ``element`` exactly ``times`` times.

        Nothing happens when ``times`` is not numeric or is below one.

        :param element: element to click
        :param times: number of clicks
        :return: None
        """
        if not self.is_number(times):
            return
        # range(times): the previous range(times - 1) clicked once too few.
        for _ in range(int(times)):
            element.click()
            sleep(0.05)

    def get_url(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def ctrl_down_and_click(self, element):
        """Ctrl-click ``element``, then switch to the last window handle."""
        ActionChains(self.driver).key_down(Keys.CONTROL).perform()
        try:
            element.click()
        finally:
            # Always release CONTROL so a failed click cannot leave it held.
            ActionChains(self.driver).key_up(Keys.CONTROL).perform()
        sleep(0.5)
        self.driver.switch_to.window(self.driver.window_handles[-1])

    def next_page(self, value, key="xpath", wait=10):
        """Click the "next page" control identified by the locator.

        :return: True when the control was clicked, False otherwise
        """
        return bool(self.click_elem(value, key, wait))

    # ---------------- hooks below are meant to be overridden ---------------- #
    def login(self, account, password):
        """Hook: log in.  No-op in the base class."""
        pass

    def update(self):
        """Hook: update the database.  No-op in the base class."""
        pass

    def check_abnormal_url(self):
        """Hook: detect abnormal URLs.  No-op in the base class."""
        pass

    def check_account_status(self):
        """Hook: check the account status.  No-op in the base class."""
        pass

    def check_task_status(self):
        """Hook: check the task status.  No-op in the base class."""
        pass

    def close_mask_layer(self):
        """Hook: remove a mask/overlay layer.  No-op in the base class."""
        pass

    def is_out_of_money(self):
        """Hook: check for an insufficient balance.

        Per the original note an override should send an SMS top-up reminder
        and then enter a polling loop.  No-op in the base class.
        """
        pass

    def handel_order(self):
        """Hook: process an order.  No-op in the base class."""
        pass

    def handle_address(self):
        """Hook: process the buyer's shipping address.  No-op here."""
        pass

    def pay(self):
        """Hook: perform the payment.  No-op in the base class."""
        pass

    def record(self):
        """Hook: record results.  No-op in the base class."""
        pass

示例

test.py

from .spider import *
from selenium.webdriver.common.action_chains import ActionChains


class Test(AutoGlance):
    """Example AliExpress crawler built on :class:`AutoGlance`.

    ``data`` is a two-item sequence: ``data[0]`` is typed into the login-id
    field and ``data[1]`` into the password field.
    """

    def __init__(self, data, ip=None, por=None):
        """
        :param data: sequence holding the account and the password
        :param ip: forwarded to ``AutoGlance.__init__``
        :param por: forwarded to ``AutoGlance.__init__``
        """
        super().__init__(ip, por)
        self.data = data

    def login(self, account=None, password=None):
        """Fill in and submit the AliExpress buyer login form.

        :param account: login id; defaults to ``self.data[0]``
        :param password: password; defaults to ``self.data[1]``
        :return: None
        """
        # Optional parameters keep the signature compatible with the base
        # class while still allowing the argument-less call.
        if account is None:
            account = self.data[0]
        if password is None:
            password = self.data[1]
        self.driver.get("https://login.aliexpress.com/buyer.htm")
        self.driver.switch_to.frame("alibaba-login-box")  # enter the login frame
        self.send_to_input(account, "//*[@id='fm-login-id']")  # account
        sleep(2)
        self.send_to_input(password, "//*[@id='fm-login-password']")  # password
        sleep(2)
        self.click_elem("//*[@id='login-form']/div[5]/button")  # submit

    def close_layer(self, wait=4):
        """Click every element whose class contains 'close'.

        Used to dismiss promotional/coupon overlays.  Best effort: elements
        that cannot be clicked are skipped.

        :param wait: maximum number of seconds to wait for such elements
        :return: None
        """
        for elem in self.find_elements("//*[contains(@class,'close')]", wait=wait):
            try:
                elem.click()
                sleep(0.1)
            except Exception:
                # Hidden or stale close buttons are expected; a bare
                # ``except:`` would also swallow KeyboardInterrupt.
                continue

    def get_categories_list(self):
        """Ctrl-click each navigation category and crawl its product list.

        :return: None
        """
        self.wait_for_page("www.aliexpress.com")
        self.close_layer()
        categories_list = self.find_elements("//dt[@class='cate-name']//a")
        for category in categories_list:
            # Dismiss overlays before CONTROL is pressed so the close buttons
            # are not clicked with the modifier held.
            self.close_layer()
            ActionChains(self.driver).key_down(Keys.CONTROL).perform()
            try:
                category.click()
            finally:
                # Release CONTROL even when the click fails.
                ActionChains(self.driver).key_up(Keys.CONTROL).perform()
            self.driver.switch_to.window(self.driver.window_handles[-1])
            self.wait_for_page("category")
            self.get_products_list()
            self.close_other(1)

    def get_products_list(self):
        """Visit every product on the current list page and following pages.

        Each product link is clicked and its detail page processed; extra
        windows are then closed so that two remain.  The loop continues while
        :meth:`next_page` reports that another page was opened.

        :return: None
        """
        # Iterative pagination: one loop pass per page instead of one
        # recursive call per page.
        while True:
            products_list = self.find_elements("//h3/a")
            if not products_list:
                # Alternative list-page markup.
                products_list = self.find_elements(
                    "//*[contains(@class,'item')]//div[contains(@class,'title')]/a")
            self.close_layer()
            self.scroll(5, 10)
            for index, product in enumerate(products_list, start=1):
                print(index)
                product.click()
                self.get_product_detail()
                self.close_other(2)
            if not self.next_page():
                return
            sleep(1)

    def get_product_detail(self):
        """Switch to the newest window and print the page's visible text.

        :return: None
        """
        sleep(0.5)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.close_layer()
        self.scroll(10, 15)
        root = self.find_elements('/html')
        # find_elements returns [] on timeout; avoid an IndexError then.
        if root:
            print(root[0].text)

    def next_page(self, value=None, key="xpath", wait=3):
        """Click the "next page" control of a product list.

        Without arguments the two built-in pagination locators are tried in
        order.  When ``value`` is given the base-class behaviour is used with
        that locator instead.

        :param value: optional custom locator expression
        :param key: locator strategy used together with ``value``
        :param wait: maximum number of seconds to wait per lookup
        :return: bool, True when a next-page control was clicked
        """
        print("next page")
        if value is not None:
            return bool(super().next_page(value, key, wait))
        next_elem = self.find_elements(
            "//*[@class='page-next ui-pagination-next']", wait=wait)
        if not next_elem:
            next_elem = self.find_elements(
                "//*[contains(@class,'next-pagination-item next-next')]", wait=wait)
        if not next_elem:
            return False
        try:
            next_elem[0].click()
            sleep(0.5)
            return True
        except Exception as e:
            print(e)
            return False

    def main(self):
        """Placeholder for assembling the crawl steps.

        For example: call ``self.login()`` first, then fetch the platform's
        product categories.
        """
        pass


if __name__ == '__main__':
    # Placeholder credentials (the original note says they are fake);
    # replace them with a real login id and password before running.
    datas = ['qq123456789@gmail.com', '0123456789']
    test = Test(data=datas)
    try:
        test.login()
        test.get_categories_list()
    finally:
        # Always shut the browser down so no Chrome/chromedriver process is
        # left running after the crawl ends or fails.
        test.driver.quit()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值