店铺卖家的描述信息爬取

1数据库



"""
使用须知:
代码中数据表名 mirror_desc ,需要更改该数据表名称的注意更改


"""
import pymysql

class Database():
    # 设置本地数据库用户名和密码
    host = "localhost"
    user = "root"
    password = "***"
    database = "test"
    port = 3306
    charset = "utf8"
    cursor=''
    connet =''
    def __init__(self):
        #连接到数据库
        self.connet = pymysql.connect(host = self.host , user = self.user,password = self.password , database = self.database, charset = self.charset)
        self.cursor = self.connet.cursor()
    # #删表
    def dropTables(self):
        self.cursor.execute('''drop table if exists mirror_desc''')
        print("删表")
    #建表
    def createTables(self):
        self.cursor.execute('''create table if not exists mirror_desc
                        ( 
                            ebayno    varchar(200) primary key not null,
                            item_specifications varchar(2000));''')
        print("建表")
    #保存数据
    def save(self,aceslist):
        self.cursor.execute("insert into mirror_desc ( ebayno, item_specifications) values(%s,%s)", (aceslist[0],aceslist[1]))
        self.connet.commit()
    #判断元素是否已经在数据库里,在就返回true ,不在就返回false
    def is_exists_ebayno(self,ebayno):
        self.cursor.execute('select * from mirror_desc where ebayno = %s',ebayno)
        if self.cursor.fetchone() is None:
            return False
        return True


# db =Database()


"""
使用须知:
代码中数据表名 mirror_desc_carpart ,需要更改该数据表名称的注意更改


"""
import pymysql

class Database_carpart():
    # 设置本地数据库用户名和密码
    host = "localhost"
    user = "root"
    password = "***"
    database = "test"
    port = 3306
    charset = "utf8"
    cursor=''
    connet =''
    def __init__(self):
        #连接到数据库
        self.connet = pymysql.connect(host = self.host , user = self.user,password = self.password , database = self.database, charset = self.charset)
        self.cursor = self.connet.cursor()
    # #删表
    def dropTables(self):
        self.cursor.execute('''drop table if exists mirror_desc_carpart''')
        print("删表")
    #建表
    def createTables(self):
        self.cursor.execute('''create table if not exists mirror_desc_carpart
                        ( 
                            ebayno    varchar(200) primary key not null,
                            Signal varchar(200),
                            Folding varchar(200),
                            Heated varchar(200),
                            Towing varchar(200),
                            operation varchar(200));''')
        print("建表")
    #保存数据
    def save(self,aceslist):
        self.cursor.execute("insert into mirror_desc_carpart ( ebayno,Signal,Folding,Heated,Towing,operation) values(%s,%s,%s,%s,%s)",(aceslist[0],aceslist[1],aceslist[2],aceslist[3],aceslist[4]))
        self.connet.commit()
    #判断元素是否已经在数据库里,在就返回true ,不在就返回false
    def is_exists_ebayno(self,ebayno):
        self.cursor.execute('select * from mirror_desc_carpart where ebayno = %s',ebayno)
        if self.cursor.fetchone() is None:
            return False
        return True


# db =Database()

"""
使用须知:
代码中数据表名 mirror_desc_martins ,需要更改该数据表名称的注意更改 mirror_desc_martins


"""
import pymysql

class Database_martins():
    # 设置本地数据库用户名和密码
    host = "localhost"
    user = "root"
    password = "****"
    database = "test"
    port = 3306
    charset = "utf8"
    cursor=''
    connet =''
    def __init__(self):
        #连接到数据库
        self.connet = pymysql.connect(host = self.host , user = self.user,password = self.password , database = self.database, charset = self.charset)
        self.cursor = self.connet.cursor()
    # #删表
    def dropTables(self):
        self.cursor.execute('''drop table if exists mirror_desc_martins''')
        print("删表")
    #建表
    def createTables(self):
        self.cursor.execute('''create table if not exists mirror_desc_martins
                        ( 
                            ebayno    varchar(200) primary key not null,
                            Signal varchar(200),
                            Folding varchar(200),
                            Heated varchar(200),
                            Towing varchar(200),
                            operation varchar(200));''')
        print("建表")
    #保存数据
    def save(self,aceslist):
        self.cursor.execute("insert into mirror_desc_martins ( ebayno,Signal,Folding,Heated,Towing,operation) values(%s,%s,%s,%s,%s,%s)",(aceslist[0],aceslist[1],aceslist[2],aceslist[3],aceslist[4],aceslist[5]))
        self.connet.commit()
    #判断元素是否已经在数据库里,在就返回true ,不在就返回false
    def is_exists_ebayno(self,ebayno):
        self.cursor.execute('select * from mirror_desc_martins where ebayno = %s',ebayno)
        if self.cursor.fetchone() is None:
            return False
        return True


# db =Database()


"""
使用须知:
代码中数据表名 mirror_desc_primechoice ,需要更改该数据表名称的注意更改 mirror_desc_primechoice


"""
import pymysql

class Database_primechoice():
    # 设置本地数据库用户名和密码
    host = "localhost"
    user = "root"
    password = "****"
    database = "test"
    port = 3306
    charset = "utf8"
    cursor=''
    connet =''
    def __init__(self):
        #连接到数据库
        self.connet = pymysql.connect(host = self.host , user = self.user,password = self.password , database = self.database, charset = self.charset)
        self.cursor = self.connet.cursor()
    # #删表
    def dropTables(self):
        self.cursor.execute('''drop table if exists mirror_desc_primechoice''')
        print("删表")
    #建表
    def createTables(self):
        self.cursor.execute('''create table if not exists mirror_desc_primechoice
                        ( 
                            ebayno   varchar(200) primary key not null,
                            Notes varchar(2000),
                            Power varchar(200),
                            Heated varchar(200));''')
        print("建表")
    #保存数据
    def save(self,aceslist):
        self.cursor.execute("insert into mirror_desc_primechoice ( ebayno,Notes,Power,Heated) values(%s,%s,%s,%s)",(aceslist[0],aceslist[1],aceslist[2],aceslist[3]))
        self.connet.commit()
    #判断元素是否已经在数据库里,在就返回true ,不在就返回false
    def is_exists_ebayno(self,ebayno):
        self.cursor.execute('select * from mirror_desc_primechoice where ebayno = %s',ebayno)
        if self.cursor.fetchone() is None:
            return False
        return True


# db =Database()



2函数体

"""
不同店铺的数据存储方式不一致所以有多个数据库,需要随时更改对应的数据库

"""
import urllib.parse
import urllib.parse
import urllib.request
from queue import Queue
import time
import random
import threading
import logging
import pymysql
import re
from bs4 import BeautifulSoup
from local_data import Database
from local_data_carpartswholesale import Database_carpart
from local_data_primechoice import Database_primechoice
from local_data_martins import Database_martins
#一个模块中存储多个类 AmazonSpeder ,  ThreadCrawl(threading.Thread), AmazonSpiderJob
class AmazonSpider():

    def __init__(self):
        self.db = Database()
        # self.db = Database_carpart()
        # self.db = Database_primechoice()
        # self.db = Database_martins()


    def randHeader(self):
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
        }
        return header

    def getdesc_am_autoparts(self,ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析
        content = soup.find_all("div", class_="item_specifications")  # 选取desc部分 am-autoparts店铺
        if len(content) == 0:
            self.db.save([ebayno, ""])  # 保存到数据库
            return
        strings = []
        for string in content[0].stripped_strings:
            strings.append(string)
        strings = "|".join(strings)
        self.db.save([ebayno, strings])  # 保存到数据库

    def getdesc_carpartswholesale(self,ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析
        # print(soup.prettify())
        content = soup.find_all("tr", class_="paBottomBorder")  # 定位表格数据
        item = {}  # 用字典存储表格数据
        for i in content:
            si = i.find_all("td")
            key = ""
            values = ""
            for string in si[0].stripped_strings:
                key = string
            for string in si[1].stripped_strings:
                values = string
            if (key == "Folding type") or (key == "Heated") or (key == "Towing mirror") or (key == "Glass operation"):
                item[key] = values
        self.db.save( [ebayno, item.setdefault("Folding type", None),item.setdefault("Heated", None),item.setdefault("Towing mirror", None),item.setdefault("Glass operation", None)])  # 保存到数据库
    def getdesc_everyday(self,ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析
        content = soup.find_all("h3", text="Features")  # 定位表格数据
        if len(content) == 0:
            self.db.save([ebayno, "","",""])  # 保存到数据库
            return
        ul = content[0].find_next_sibling("ul")
        strings = []
        for string in ul.stripped_strings:
            strings.append(string)
        strings = "|".join(strings)
        self.db.save([ebayno, strings])  # 保存到数据库

    def getdesc_primechoice(self, ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析
        content = soup.find_all("table", class_="fitment")  # 定位notes信息
        if len(content) == 0:
            self.db.save([ebayno, "", "", ""])  # 保存到数据库
            return
        htr = content[0].find_all("th")
        hstr = [i.string for i in htr]
        trow1 = content[0].find_all("tr")[1]
        tdstr = [i.string for i in trow1.find_all("td")]
        item = {}
        for i in range(len(tdstr)):
            item[hstr[i]] = tdstr[i]
        notes = item.setdefault("Notes", None)
        # 定位product details信息
        powervalue=""
        power = soup.find_all("td", text="Power") # 标签加字符串
        if len(power) != 0:
            powervalue = power[0].find_next_sibling().string
        item["power"] = powervalue
        heatedvalue=""
        heated = soup.find_all("td", text="Heated")
        if len(heated)!=0:
            heatedvalue = heated[0].find_next_sibling().string
        item["heated"] = heatedvalue
        powerstr = item.setdefault("power", None)
        heatedstr = item.setdefault("heated", None)
        self.db.save([ebayno, notes,powerstr,heatedstr])  # 保存到数据库
    def getdesc_martins_bold(self,soup,bold):
        boldstr = ""
        strongs = soup.find("strong", text=bold) # 标签加文本
        reg = str(strongs) + '(.*?)' + "<br/>"
        signalstr = re.findall(reg, str(soup), re.S) #正则表达式
        if len(signalstr) != 0:
            signalstr = signalstr[0]
            boldstr = str(signalstr).strip()  # 删除前后空格
        return boldstr
    def getdesc_martins(self, ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析
        item={}
        item["Built-in signal light:"] = self.getdesc_martins_bold(soup,"Built-in signal light:")
        item["Folding type:"] = self.getdesc_martins_bold(soup,"Folding type:")
        item["Heated:"] = self.getdesc_martins_bold(soup,"Heated:")
        item["Towing mirror:"] = self.getdesc_martins_bold(soup,"Towing mirror:")
        item["Glass operation:"] = self.getdesc_martins_bold(soup,"Glass operation:")
        self.db.save([ebayno, item["Built-in signal light:"],item["Folding type:"],item["Heated:"],item["Towing mirror:"],item["Glass operation:"] ])  # 保存到数据库

    def getdesc_discount_bold(self,soup, bold):
        boldstr = ""
        bold = soup.find("strong", text=bold) # strong + 字符串
        if bold is not None:
            boldstr = bold.find_next().string # 下一个标签
        return boldstr
    def getdesc_discount(self, ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析

        item = {}
        item["Built-in signal light"] = self.getdesc_discount_bold(soup, "Built-in signal light")
        item["Folding type"] = self.getdesc_discount_bold(soup, "Folding type")
        item["Heated"] = self.getdesc_discount_bold(soup, "Heated")
        item["Towing mirror"] = self.getdesc_discount_bold(soup, "Towing mirror")
        item["Glass operation"] = self.getdesc_discount_bold(soup, "Glass operation")
        self.db.save(
            [ebayno, item["Built-in signal light"], item["Folding type"], item["Heated"], item["Towing mirror"],item["Glass operation"]])  # 保存到数据库


    def getdesc_deal_bold(self,soup, bold):
        boldstr = ""
        bold = soup.find("strong", text=bold) # strong + 字符串
        if bold is not None:
            boldstr = bold.find_next().string # 下一个标签
        return boldstr
    def getdesc_deal(self, ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析

        item = {}
        item["Built-in signal light"] = self.getdesc_discount_bold(soup, "Built-in signal light")
        item["Folding type"] = self.getdesc_discount_bold(soup, "Folding type")
        item["Heated"] = self.getdesc_discount_bold(soup, "Heated")
        item["Towing mirror"] = self.getdesc_discount_bold(soup, "Towing mirror")
        item["Glass operation"] = self.getdesc_discount_bold(soup, "Glass operation")
        self.db.save(
            [ebayno, item["Built-in signal light"], item["Folding type"], item["Heated"], item["Towing mirror"],item["Glass operation"]])  # 保存到数据库

    def getdesc_art(self,ebayno):
        ebayno = str(ebayno)
        # 如果数据库中有的数据,直接返回不处理
        if self.db.is_exists_ebayno(ebayno):
            return
        url = "http://vi.vipr.ebaydesc.com/ws/eBayISAPI.dll?ViewItemDescV4&item=" + ebayno
        # webbrowser.open(url)
        req = urllib.request.Request(url=url, headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # 解析
        newsoup = soup.prettify()
        left = newsoup.find("Mirror Features:")  # 先字符串定位Mirror 再网页元素抽取
        soup = BeautifulSoup(newsoup[left:], 'html.parser')
        ul = soup.find("ul")
        stringto = ""
        if ul is not None:
            strings = []
            for string in ul.stripped_strings:
                strings.append(string)
            stringto = "|".join(strings)
        # print(stringto)
        self.db.save([ebayno, stringto])  # 保存到数据库
class ThreadCrawl(threading.Thread): #ThreadCrawl类继承了Threading.Thread类

    def __init__(self, queue):  #子类特有属性, queue
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = AmazonSpider()  #子类特有属性spider, 并初始化,将实例用作属性

    def run(self):
        while True:
            success = True
            item = self.queue.get() #调用队列对象的get()方法从队头删除并返回一个项目item
            self.spider.getdesc_art(item)  # 调用实例spider的方法getDataById(item)
            # try:
            #     self.spider.getdesc(item) #调用实例spider的方法getDataById(item)
            # except :
            #     # print("失败")
            #     success = False
            # if not success :
            #     self.queue.put(item)
            logging.info("now queue size is: %d" % self.queue.qsize()) #队列对象qsize()方法,返回队列的大小
            self.queue.task_done() #队列对象在完成一项工作后,向任务已经完成的队列发送一个信号

class SpiderJob():
    def __init__(self , size , qs):
        self.size = size  # 将形参size的值存储到属性变量size中
        self.qs = qs
    def work(self):
        toSpiderQueue = Queue() #创建一个Queue队列对象
        for q in self.qs:
            toSpiderQueue.put(q)  #调用队列对象的put()方法,在对尾插入一个项目item
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue)    #将实例用到一个类的方法中
            t.setDaemon(True)
            t.start()
        toSpiderQueue.join()    #队列对象,等到队列为空,再执行别的操作





3主函数


from  ebay_description import SpiderJob #从一个模块中导入类
import pymysql
import pandas as pd
from local_data import  Database
from local_data_carpartswholesale import Database_carpart
from local_data_primechoice import Database_primechoice
from local_data_martins import Database_martins

if __name__ == '__main__':
    #初次跑程序的时候,需要删除旧表,然后新建表,之后重启再跑的时候需要注释
    #----------------------
    # db = Database()
    # db.dropTables()
    # db.createTables()

    # db = Database_carpart()
    # db.dropTables()
    # db.createTables()

    # db = Database_primechoice()
    # db.dropTables()
    # db.createTables()
    # db = Database_martins()
    # db.dropTables()
    # db.createTables()

    #---------------------------
    df = pd.read_excel("ebaynos.xlsx")
    # print(df.info())
    qs = df["ebayno"].values
    # print(qs)
    print(len(qs))
    amazonJob = SpiderJob(8, qs)
    amazonJob.work()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值