eBay fitment JSON parsing

Method 1


s = re.findall("\((.*)\)", s)[0]

Replace the corresponding regular expression in the code below with this line.
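
The GetFitmentData URL used in the script below carries a cb= (jQuery callback) parameter, so the response comes back JSONP-wrapped, e.g. jQuery1709105713909136433_1479456959821({"status":{...},"data":[...]}). The regex simply strips that wrapper so json.loads() can parse the payload. A minimal sketch of the idea (the helper name is illustrative, not part of the original script):

import json
import re

def extract_jsonp_payload(body):
    # body looks like: callbackName({"status": {...}, "data": [...]})
    # grab everything between the outermost parentheses and parse it as JSON
    payload = re.findall(r"\((.*)\)", body, re.S)[0]
    return json.loads(payload)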



import logging
import random
import threading
import urllib.parse
import urllib.request
from queue import Queue
import pymysql
from bs4 import BeautifulSoup
import time
import re
import csv
import json

class Spider():
    def randHeader(self):
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
        }
        return header


    def getBeautifulSoup(self,ebayno):
        url_1 = 'http://frame.ebay.com/ws/eBayISAPI.dll?GetFitmentData&rand=147945705603&site=100&vs=0&req=2&cid=33706&item=' + str(
            ebayno).strip() + '&ct=20&pn=FitmentComments%7CYear%7CMake%7CModel%7CTrim%7CEngine&page=1000&cb=jQuery1709105713909136433_1479456959821&_=1479457056036'

        req = urllib.request.Request(url= url_1 , headers=self.randHeader())
        webpage = urllib.request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        return  soup

    def getFitment(self,ebayno):
        print(ebayno)
        out = open("result\\"+str(ebayno)+".csv", "w",newline="",encoding="utf-8")
        csv_writer = csv.writer(out)
        csv_writer.writerow(['fitmentcomment','year','make','model','trim','engine','ebayno'])
        soup = self.getBeautifulSoup(ebayno)
        s = str(soup)
        # the response is JSONP (callback-wrapped); strip the wrapper before parsing
        s = re.findall("\((.*)\)", s)[0]
        sjson = json.loads(s)
        data = sjson["data"]
        if data is None:
            with open("result\\fitment_empty.txt","a")as f:
                f.write(ebayno+"\n")
            return
        for i in range(len(data)):
            row = data[i]
            if "Trim" in row:
                Trim = row["Trim"][0]
            else:
                Trim = "All"
            if "FitmentComments" in row:
                FitmentComments = row["FitmentComments"][0]
            else:
                FitmentComments = "All"
            if "Engine" in row:
                Engine = row["Engine"][0]
            else:
                Engine = "All"
            Year = row["Year"][0]
            Make = row["Make"][0]
            Model = row["Model"][0]
            csv_writer.writerow([FitmentComments,  Year, Make,  Model, Trim , Engine, ebayno])

class ThreadCrawl(threading.Thread):  # ThreadCrawl inherits from threading.Thread

    def __init__(self, queue):  # subclass-specific attribute: queue
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[Spider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = Spider()  # subclass-specific attribute spider; the Spider instance is used as an attribute

    def run(self):
        while True:
            success = True
            item = self.queue.get()  # get() removes and returns an item from the head of the queue
            try:
                self.spider.getFitment(item)  # call the spider instance's getFitment(item)
            except:
                success = False
            if not success:
                self.queue.put(item)  # re-queue the item so it is retried
            logging.info("now queue size is: %d" % self.queue.qsize())  # qsize() returns the current size of the queue
            self.queue.task_done()  # signal that a previously enqueued task is complete

class SpiderJob():

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs

    def work(self):
        toSpiderQueue = Queue()  # create a Queue object
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue)  # each worker thread shares the same queue
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)  # put() appends an item at the tail of the queue
        toSpiderQueue.join()  # block until every queued item has been processed









Script 2 (the main entry: runs the spider, then merges the per-item CSV files):


from fitment_ebay import SpiderJob  # import the SpiderJob class from the module above
import csv
if __name__ == '__main__':
    qs = []
    with open("test.txt") as fp:
        for line in fp:
            qs.append(str(line.strip()))

    qs = qs[0:100]  # only the first 100 items are processed here; adjust or drop the slice as needed
    Job = SpiderJob(8, qs)
    Job.work()
    out = open("result\\together.csv","w",newline="",encoding="utf-8")
    csv_writer = csv.writer(out)
    csv_writer.writerow(['fitmentcomment', 'year', 'make', 'model', 'trim', 'engine', 'ebayno'])
    n = len(qs)
    for i in range(n):
        with open("result\\"+str(qs[i])+".csv", newline="",encoding="utf-8") as f:
            reader = csv.reader(f)
            k = 0
            for row1 in reader:
                if k == 0:  # skip the header row
                    k = 1
                    continue
                if len(row1) == 0:  # skip the trailing empty row
                    break
                csv_writer.writerow(row1)

Method 2:

"""
Usage notes:
The table name used in the code is fitment; if you want a different table name, make sure to change fitment everywhere it appears.

"""

import random
import urllib
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import csv
import numpy as np
import xlrd
import os
import json
from queue import Queue
import time
import threading
import logging
import pandas as pd
import pymysql



class Database():
    def __init__(self):
        self.tablename = "fitment"  # the table name can be configured here
        self.host = "localhost"
        self.user = "root"
        self.password ="123456"
        self.database="ebay"
        self.charset = "utf8"
        self.connect = pymysql.connect(host = self.host, user = self.user,password = self.password, database = self.database, charset =  self.charset)
        self.cursor = self.connect.cursor()
    # drop table
    def dropTables(self):
        sql = 'drop table if exists '+self.tablename
        self.cursor.execute(sql)
        print("dropped table")
    # create table
    def createTables(self):
        sql = 'create table if not exists '+ self.tablename+ ''' 
        ( 
            id int(11) primary key auto_increment, 
            ebayno varchar(100) not null, 
            note varchar(5000) ,
            year varchar(100),
            make varchar(100),
            model varchar(100),
            trim varchar(1000),
            engine varchar(1000)
        ) 
        '''
        self.cursor.execute(sql)
        print("建表")
    #判断是否存在ebayno
    def is_exists_ebayno(self,ebayno):
        sql = 'select * from '+self.tablename + ' where ebayno = %s'
        self.cursor.execute(sql,ebayno)
        if self.cursor.fetchone() is None:
            return False
        return True
    # save one fitment row
    def save(self,ebayno,note,year,make,model,trim,engine):
        sql = 'insert into '+self.tablename+' (ebayno,note,year,make,model,trim,engine) values (%s,%s,%s,%s,%s,%s,%s)'
        self.cursor.execute(sql,(ebayno,note,year,make,model,trim,engine))
        self.connect.commit()


class EbaySpider(object):
    def __init__(self):
        self.db = Database()
        self.SESSION = requests.session()
        self.SESSION.cookies = CookieJar()
        self.HEAD = self.randHeader()

    def randHeader(self):
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
        }
        return header

    def get_fitment(self,ebayno):
        ebayno = str(ebayno)
        if self.db.is_exists_ebayno(ebayno):
            return

        # http://developer.ebay.com/devzone/shopping/docs/callref/getsingleitem.html
        # Compatibility 	Description 	Details 	ItemSpecifics 	ShippingCosts 	TextDescription 	Variations
        url = "http://open.api.ebay.com/shopping?callname=GetSingleItem&responseencoding=JSON&" \
              "appid=yourappid&siteid=100&version=967&" \
              "IncludeSelector=Compatibility&" \
              "ItemID=" + ebayno
        r = requests.get(url=url,headers = self.HEAD)
        items = json.loads(r.text)
        item={}
        try:
            fitmentcount = items["Item"]["ItemCompatibilityCount"]
        except:
            print("fitmentcount异常")   # end
            self.db.save(ebayno,item.setdefault("note",None), item.setdefault("Year",None),item.setdefault("Make",None),item.setdefault("Model",None),item.setdefault("Trim",None),item.setdefault("Engine",None))
            return
        # ItemCompatibilityList is already parsed JSON, so it can be used directly
        try:
            jsonstr = items["Item"]["ItemCompatibilityList"]
        except:
            print("ItemCompatibilityList missing")
            return
        for i in jsonstr["Compatibility"]:
            value = i["NameValueList"]
            item = {"note": i["CompatibilityNotes"]}
            for v in value:
                try:
                    item[v["Name"]] = v["Value"][0]
                except:
                    continue
            self.db.save(ebayno,item.setdefault("note",None), item.setdefault("Year",None),item.setdefault("Make",None),item.setdefault("Model",None),item.setdefault("Trim",None),item.setdefault("Engine",None))

class ThreadCrawl(threading.Thread):  # ThreadCrawl inherits from threading.Thread

    def __init__(self, queue):  # subclass-specific attribute: queue
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[EbaySpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = EbaySpider()  # subclass-specific attribute spider; the EbaySpider instance is used as an attribute

    def run(self):
        while True:
            item = self.queue.get()  # get() removes and returns an item from the head of the queue
            self.spider.get_fitment(item)  # call the spider instance's get_fitment(item)
            logging.info("now queue size is: %d" % self.queue.qsize())  # qsize() returns the current size of the queue
            self.queue.task_done()  # signal that a previously enqueued task is complete

class EbaySpiderJob():

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs

    def work(self):
        toSpiderQueue = Queue()  # create a Queue object
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue)  # each worker thread shares the same queue
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)  # put() appends an item at the tail of the queue
        toSpiderQueue.join()  # block until every queued item has been processed

if __name__ == '__main__':
    ### test
    # spider = EbaySpider()
    # spider.get_fitment("371809250769")
    # exit()

    # #### create the database table; comment this block out when resuming a stopped run
    # db = Database()
    # db.dropTables()
    # db.createTables()
    # exit()
    #### create database table end
    #
    df = pd.read_excel("ebaynos_new.xlsx")
    qs = df["ebayno"].values
    ebayJob = EbaySpiderJob(16, qs)
    ebayJob.work()




Method 3: query the fitment URL only to determine the year range for a given make and model


"""
2017-11-01
1 :在抓取fitment时,抓取api地址为
        url_1 = 'https://frame.ebay.com/ebaymotors/ws/eBayISAPI.dll?GetFitmentData&rand=1509417472792&site=100&vs=0&req=2&item=' + str(ebayno) + '&ct=100&pn=&page=200'
        &ct每页展示100
        &page第几页,当页码大于最大页时,可展示全部
如果只想抓取部分 Year Make Model 可通过&sfp=设置
        url_1 = 'https://frame.ebay.com/ebaymotors/ws/eBayISAPI.dll?GetFitmentData&rand=1509417472792&site=100&vs=0&req=2&sfp=Make%253A' + make + '%257CModel%253A' + model + '&item=' + str(ebayno) + '&ct=100&pn=&page=200'
2: 通过网址,响应的状态可能分三种情况:
"status":{"id":1,"name":"SUCCESS"}   该状态为正常SUCCESS,显示json格式的数据 ,data = sjson["data"]
"status":{"id":0,"name":"FAILURE"},   非正常Failure,"data":[],
"status":null  非正常null "data":null

3:对于某些特殊的ebayno  271595213456 191749891947 当非正常网页出现3次,就停止对该ebayno的抓取
"""



import logging
import random
import threading
import urllib.parse
import requests
from queue import Queue
import pymysql
from bs4 import BeautifulSoup
import time
import re
import csv
import json
import webbrowser
import pandas as pd
import os
from http.cookiejar import CookieJar
class Spider():
    def __init__(self):
        self.SESSION = requests.session()
        self.SESSION.cookies = CookieJar()
        self.HEAD = self.randHeader()
        self.Count = {}
    def randHeader(self):
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
        }
        return header



    def getFitment(self, item):
        make = str(item[0])
        model_original = str(item[1])
        ebayno = str(item[2])
        restr = " ".join([make, model_original, ebayno])
        if os.path.exists("ebaynos\\"+restr+".xlsx"):
            return
        if ebayno not in self.Count.keys():
            self.Count[ebayno]=1
        else:
            if self.Count[ebayno] == 3:  # stop scraping this ebayno after 3 failed attempts
                return
            self.Count[ebayno]+=1
        print(ebayno)
        model = str(model_original).replace("&","%26")
        url_1 = 'https://frame.ebay.com/ebaymotors/ws/eBayISAPI.dll?GetFitmentData&rand=1509417472792&site=100&vs=0&req=2&sfp=Make%253A' + make + '%257CModel%253A' + model + '&item=' + str(ebayno) + '&ct=100&pn=&page=200'
        # webbrowser.open(url_1)
        # webbrowser.open("https://www.ebay.com/itm/"+str(ebayno))
        # print(url_1)
        r = self.SESSION.get(url=url_1, headers=self.HEAD)
        # r = requests.get(url_1,headers = self.randHeader())
        soup = BeautifulSoup(r.text , "html.parser")
        # print("soup")
        # print(soup)
        s = str(soup)
        sjson = json.loads(s)
        data = sjson["data"]
        # if data is None:
        #     with open("fitment_empty.txt", "a") as f:
        #         f.write(ebayno + "\n")
        #     return
        alen = len(data)  # when "status" is null, data is None and len() raises; the caller re-queues the item
        Years = []
        for i in range(len(data)):
            row = data[i]
            Year = row["Year"][0]
            Years.append(Year)
        y_start = min(Years)  # when "status" is FAILURE, data is [] and min() raises; the caller re-queues the item
        y_end = max(Years)
        temp = pd.DataFrame({"make":[make],"model":[model_original],"ebayno":[ebayno],"ebay_start":[y_start],"ebay_end":[y_end]}, columns=["make","model","ebayno","ebay_start","ebay_end"])
        restr = " ".join([make,model_original,ebayno])
        temp.to_excel("ebaynos\\"+restr+".xlsx", index=False)




class ThreadCrawl(threading.Thread):  # ThreadCrawl inherits from threading.Thread

    def __init__(self, queue):  # subclass-specific attribute: queue
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[Spider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = Spider()  # subclass-specific attribute spider; the Spider instance is used as an attribute

    def run(self):
        while True:
            success = True
            item = self.queue.get()  # get() removes and returns an item from the head of the queue
            try:
                self.spider.getFitment(item)  # call the spider instance's getFitment(item)
            except:
                success = False
            if not success:
                self.queue.put(item)  # re-queue the item so it is retried
            logging.info("now queue size is: %d" % self.queue.qsize())  # qsize() returns the current size of the queue
            self.queue.task_done()  # signal that a previously enqueued task is complete

class SpiderJob():

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs

    def work(self):
        toSpiderQueue = Queue()  # create a Queue object
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue)  # each worker thread shares the same queue
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)  # put() appends an item at the tail of the queue
        toSpiderQueue.join()  # block until every queued item has been processed








