From Python Crawler to Spark Data Preprocessing: A Real-World Project [3]

A few words first:
I'm a software engineering student, using this blog to record what I learn, and hopefully to help classmates who are studying the same material.
Life throws all kinds of difficulties and setbacks at everyone; running away solves nothing, and the only way through is to meet those challenges with optimism.
Youth fades quickly and learning is hard to master; not an inch of time should be taken lightly.
My favorite saying: what can be done today should be finished today.


I have only just started working with crawlers, so please forgive any shortcomings, and I would welcome any pointers.

Series Contents

From Python Crawler to Spark Data Preprocessing: A Real-World Project [1]
From Python Crawler to Spark Data Preprocessing: A Real-World Project [2]
From Python Crawler to Spark Data Preprocessing: A Real-World Project [3]
From Python Crawler to Spark Data Preprocessing: A Real-World Project [4]
From Python Crawler to Spark Data Preprocessing: A Real-World Project [5]



Preface

This chapter fetches the data directly with Requests. There is nothing especially tricky about the fetching itself, but one point deserves attention: if you request a product link directly, the detail page that comes back does not contain the price. That is why the price had to be captured back in chapter two, during the selenium stage.
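For context: the detail page renders its price client-side through a separate price interface, which is why a plain requests fetch never sees it. At the time, scrapers commonly worked around this with JD's public price endpoint; the sketch below assumes that endpoint (p.3.cn) and its old response shape, both of which may well have changed since:

import requests

def get_price(sku_id):
    # Assumed endpoint and response format - verify before relying on it.
    url = f"https://p.3.cn/prices/mgets?skuIds=J_{sku_id}"
    data = requests.get(url, timeout=3).json()
    return data[0].get('p')  # 'p' held the current price in the old format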


Tip: the main content of the article follows; the examples below are for reference.

The data source is

From Python Crawler to Spark Data Preprocessing: A Real-World Project [2]
that is, the data that chapter two's selenium script wrote out to local files;
here we simply read those files back in.
For example:
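Each line of the URL file is a Python dict literal written by the selenium stage. The values below are made up for illustration, but the keys are exactly what readUrl() expects:

{'href_url': 'https://item.jd.com/100000000000.html', 'bran_name': 'NGK', 'price': '56.00', 'skuId': '100000000000'}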

The first approach fetches each product page directly, one request at a time.

Spark Plugs

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
import ast
def get_proxy():
     # This is my personal proxy pool;
     # I won't show it here - if you need one, try building a proxy pool yourself.
     # Feel free to ask me if anything is unclear; I will publish a tutorial soon.
     return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # random User-Agent generator
    # the request headers can then be written as:
    cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
    headers = {"User-Agent": ua.random,
               'Cookie': cookie}
    trytimes = 1000  # 重试的次数

    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers, proxies={"http": "https://{}".format(proxy)}, timeout=3)
            # response = requests.get(url, headers=headers,timeout=3)
            #	注意此处也可能是302等状态码
            if response.status_code == 200:
                break
        except:
            # logdebug(f'requests failed {i}time')
            print(f'requests failed {i} time','要获取的URL:',url)


    return response.text

def readUrl():
    with open('E:\\url\\HHS\\JD_HHS_URLS.txt', 'r', encoding='utf-8') as jd:
        for line in jd.readlines():
            # each line is a dict literal; literal_eval parses it without eval's risks
            jd_Href_Name = ast.literal_eval(line.strip())
            href = jd_Href_Name['href_url']
            if "ccc-x.jd.com" in str(href):
                href = href[len('https:'):]  # trim the leading 'https:' from ccc-x.jd.com links
            brand = jd_Href_Name['bran_name']
            price = jd_Href_Name['price']
            skuId = jd_Href_Name['skuId']
            # ti.sleep(random.random() * 1)
            getProduct(href, brand, skuId, price)
def getProduct(https_li_href,brand_name,product_Sku,product_Price):
    db = "INSERT INTO `xxuan_car_jd_hhs_product` VALUES (NULL,"
    sql = {'skuid': '',
           'name': '',
           'brand': '',
           'price':'',
           'url': '',
           'commodity_Name':'',
           'image':'',
           'sales':'',
           'material': '',
           'type': '',
           'ArticleNumbera': '',
           'GrossWeight': ''
           }
    sql['url']=https_li_href
    sql['brand']=brand_name
    sql['price']=product_Price
    sql['skuid']=product_Sku
    product_HTML = getHTML(https_li_href)
    produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # Product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # Product image
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # Product spec parameters: map each label on the page to its column key
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        label_to_key = {'商品名称:': 'name', '销售规格:': 'sales', '产品材质:': 'material',
                        '产品类型:': 'type', '货号:': 'ArticleNumbera', '商品毛重:': 'GrossWeight'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, key in label_to_key.items():
                if label in text:
                    sql[key] = text.replace(label, '')
                    break
    # Replace empty fields with NULL and finish the INSERT statement
    # ('GrossWeight' is the last key in the dict, so it closes the VALUES list)
    for i in sql:
        if len(str(sql[i])) == 0:
            sql[i] = 'NULL'
        if i != "GrossWeight":
            db += f"'{sql[i]}',"
        else:
            db += f"'{sql[i]}');"
    '''
    Option 1: write the INSERT statements to a file first, then bulk-load them with `source`
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    '''
    Option 2: insert directly
    '''
    conneMysql(db)


def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # when inserting, remember to commit, or the rows will not show up in the database
        conn.commit()
        print("Insert succeeded")

if __name__ == '__main__':
    readUrl()
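A note on get_proxy(): the only contract is that the /get/ endpoint returns JSON shaped like {'proxy': 'ip:port'}. A minimal stand-in for local testing, assuming Flask and a hand-maintained list (purely illustrative, this is not my actual pool):

import random
from flask import Flask, jsonify

app = Flask(__name__)
PROXIES = ['127.0.0.1:8888']  # replace with live proxies

@app.route('/get/')
def get():
    # hand back one random proxy in the JSON shape the crawlers expect
    return jsonify(proxy=random.choice(PROXIES))

if __name__ == '__main__':
    app.run(port=5010)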

Engine Oil

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
import ast
def get_proxy():
     # This is my personal proxy pool;
     # I won't show it here - if you need one, try building a proxy pool yourself.
     # Feel free to ask me if anything is unclear; I will publish a tutorial soon.
     return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # random User-Agent generator
    # the request headers can then be written as:
    cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
    headers = {"User-Agent": ua.random,
               'Cookie': cookie}
    trytimes = 1  # 重试的次数

    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers, proxies={"http": "https://{}".format(proxy)}, timeout=1)
            # response = requests.get(url, headers=headers,timeout=3)
            #	注意此处也可能是302等状态码
            if response.status_code == 200:
                break
        except:
            # logdebug(f'requests failed {i}time')
            print(f'requests failed {i} time','要获取的URL:',url)
            return 'False'


    return response.text

def readUrl():
    with open('D:\\url\\jy\\JD_JY_URLS.txt', 'r', encoding='utf-8') as jd:
        for line in jd.readlines():
            jd_Href_Name = ast.literal_eval(line.strip())
            # skip links on the ccc-x.jd.com domain
            if 'https://ccc-x.jd.com' not in str(jd_Href_Name['href_url']):
                href = jd_Href_Name['href_url']
                brand = jd_Href_Name['bran_name']
                ti.sleep(random.random() * 1)
                getProduct(href, brand)
def getProduct(https_li_href,brand_name):
    db = "INSERT INTO `xxuan_car_jd_mobil_product` VALUES (NULL,"
    sql = {'skuid': '',
           'name': '',
           'brand': '',
           'type': '',
           'url': '',
           'originplace': '',
           'netweight': '',
           'price': '',
           'commodity_Name': '',
           'image': '',
           'viscosity': '',
           'volume': ''
           }
    sql['url']=https_li_href
    sql['brand']=brand_name
    db_HK = f"INSERT INTO `xxuan_car_jd_hk_mobil_product` VALUES ('{https_li_href}','{brand_name}');"
    product_HTML = getHTML(https_li_href)
    if product_HTML=='False':
        conneMysql_HK(db_HK)
        return
    produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # Product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # Product price (present on some page layouts)
    summary_price = produc_soup.find('div', attrs={'class': 'summary-price J-summary-price'})
    if summary_price is not None:
        p_price = summary_price.find('div', attrs={'class': 'dd'}).find('span', attrs={'class': 'pricing'})
        if p_price is not None:
            sql['price'] = str(p_price.text).replace('[', '').replace(']', '').replace('¥', '')
        else:
            sql['price'] = 'NULL'
    else:
        sql['price'] = 'NULL'
    # Product image
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # Product spec parameters: map each label on the page to its column key
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        label_to_key = {'商品名称:': 'name', '商品编号:': 'skuid', '商品毛重:': 'netweight',
                        '商品产地:': 'originplace', '粘度:': 'viscosity',
                        '机油种类:': 'type', '容量:': 'volume'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, key in label_to_key.items():
                if label in text:
                    sql[key] = text.replace(label, '')
                    break
    # Replace empty fields with NULL and finish the INSERT statement
    # ('volume' is the last key in the dict, so it closes the VALUES list)
    for i in sql:
        if len(str(sql[i])) == 0:
            sql[i] = 'NULL'
        if i != "volume":
            db += f"'{sql[i]}',"
        else:
            db += f"'{sql[i]}');"
    '''
    Option 1: write the INSERT statements to a file first, then bulk-load them with `source`
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    '''
    Option 2: insert directly
    '''
    conneMysql(db)


def conneMysql_HK(sql):
    # identical to conneMysql below; kept as a separate name for the failed-URL fallback table
    conneMysql(sql)
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # when inserting, remember to commit, or the rows will not show up in the database
        conn.commit()
        print("Insert succeeded")

if __name__ == '__main__':
    readUrl()
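One caveat about building the INSERT by string concatenation: any product name containing a single quote will break the statement (or worse). pymysql can do the escaping itself if you pass placeholders; a sketch of the same insert done that way, assuming the table's column order matches the dict's key order:

def insert_product(conn, sql_dict):
    # %s placeholders let pymysql escape quotes inside product names
    stmt = ("INSERT INTO `xxuan_car_jd_mobil_product` "
            "VALUES (NULL, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    with conn.cursor() as cur:
        cur.execute(stmt, tuple(sql_dict.values()))
    conn.commit()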

Tires

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
import ast
def get_proxy():
     # This is my personal proxy pool;
     # I won't show it here - if you need one, try building a proxy pool yourself.
     # Feel free to ask me if anything is unclear; I will publish a tutorial soon.
     return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # random User-Agent generator
    # the request headers can then be written as:
    cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
    headers = {"User-Agent": ua.random,
               'Cookie': cookie}
    trytimes = 1  # 重试的次数

    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers, proxies={"http": "https://{}".format(proxy)}, timeout=1)
            # response = requests.get(url, headers=headers,timeout=3)
            #	注意此处也可能是302等状态码
            if response.status_code == 200:
                break
        except:
            # logdebug(f'requests failed {i}time')
            print(f'requests failed {i} time','要获取的URL:',url)
            return 'False'


    return response.text

def readUrl():
    All_URL = []
    with open('E:\\url\\luntai\\JD_LT_URLS.txt', 'r', encoding='utf-8') as jd:
        for line in jd.readlines():
            jd_Href_Name = ast.literal_eval(line.strip())
            All_URL.append(jd_Href_Name)
    return All_URL
# def getProduct(https_li_href, brand_name, price):
def getProduct(jd_Href_Name):
    ti.sleep(random.random() * 2)  # jitter so the pool's workers don't hit JD in lockstep
    https_li_href = jd_Href_Name['href_url']
    brand_name = jd_Href_Name['bran_name']
    price = jd_Href_Name['price']
    db = "INSERT INTO `xxuan_car_jd_lt_product` VALUES (NULL,"
    sql = {'skuid': '',
           'name': '',
           'brand': '',
           'url': '',
           'price': '',
           'commodity_Name': '',
           'image': '',
           'netweight': '',
           'originplace': '',
           'size': '',
           'width': '',
           'number': '',
           'performance': '',
           'Flattening': '',
           'characteristics':'',
           'type':''
           }
    sql['url']=https_li_href
    sql['brand']=brand_name
    sql['price']=price
    db_HK = f"INSERT INTO `xxuan_car_jd_lt_hk_product` VALUES ('{https_li_href}','{brand_name}');"
    product_HTML = getHTML(https_li_href)
    if product_HTML=='False':
        conneMysql_HK(db_HK)
        return
    produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # Product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # Product image
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # Product spec parameters: map each label on the page to its column key
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        label_to_key = {'商品名称:': 'name', '商品编号:': 'skuid', '商品毛重:': 'netweight',
                        '商品产地:': 'originplace', '尺寸:': 'size', '胎面宽度:': 'width',
                        '扁平比:': 'Flattening', '货号:': 'number', '花纹性能:': 'performance',
                        '轮胎特性:': 'characteristics', '车型类别:': 'type'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, key in label_to_key.items():
                if label in text:
                    sql[key] = text.replace(label, '')
                    break
    # Replace empty fields with NULL and finish the INSERT statement
    # ('type' is the last key in the dict, so it closes the VALUES list)
    for i in sql:
        if len(str(sql[i])) == 0:
            sql[i] = 'NULL'
        if i != "type":
            db += f"'{sql[i]}',"
        else:
            db += f"'{sql[i]}');"
    conneMysql(db)

def conneMysql_HK(sql):
    # identical to conneMysql below; kept as a separate name for the failed-URL fallback table
    conneMysql(sql)
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # when inserting, remember to commit, or the rows will not show up in the database
        conn.commit()
        print("Insert succeeded")

if __name__ == '__main__':
    urls=readUrl()
    pool=Pool(processes=5)
    pool.map(getProduct,urls)

Brake Pads

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
import ast
def get_proxy():
     # This is my personal proxy pool;
     # I won't show it here - if you need one, try building a proxy pool yourself.
     # Feel free to ask me if anything is unclear; I will publish a tutorial soon.
     return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # random User-Agent generator
    # the request headers can then be written as:
    cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
    headers = {"User-Agent": ua.random,
               'Cookie': cookie}
    trytimes = 100  # 重试的次数

    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers, proxies={"http": "https://{}".format(proxy)}, timeout=1)
            # response = requests.get(url, headers=headers,timeout=3)
            #	注意此处也可能是302等状态码
            if response.status_code == 200:
                break
        except:
            # logdebug(f'requests failed {i}time')
            print(f'requests failed {i} time','要获取的URL:',url)


    return response.text

def readUrl():
    with open('E:\\url\\SCP\\JD_SCP_URLS.txt', 'r', encoding='utf-8') as jd:
        for line in jd.readlines():
            jd_Href_Name = ast.literal_eval(line.strip())
            href = jd_Href_Name['href_url']
            brand = jd_Href_Name['bran_name']
            price = jd_Href_Name['price']
            skuId = jd_Href_Name['skuId']
            # ti.sleep(random.random() * 1)
            getProduct(href, brand, skuId, price)
def getProduct(https_li_href,brand_name,product_Sku,product_Price):
    db = "INSERT INTO `xxuan_car_jd_scp_product` VALUES (NULL,"
    sql = {'skuid': '',
           'name': '',
           'brand': '',
           'price':'',
           'url': '',
           'commodity_Name':'',
           'image':'',
           'Additivetype':'',
           'TypesOfAdditives':'',
           'NetContent':'',
           'ArticleNumber':'',
           'boiling':'',
           'package':'',
           'GrossWeight':'',
           'CommodityOrigin':'',
           'process':'',
           'Installation':'',
           'type':'',
           'texture':''
           }
    sql['url']=https_li_href
    sql['brand']=brand_name
    sql['price']=product_Price
    sql['skuid']=product_Sku
    product_HTML = getHTML(https_li_href)
    produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # Product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # Product image
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # Product spec parameters: map each label on the page to its column key.
    # '产品类别:' must be checked before the bare '类别:', which it contains.
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        label_to_key = {'商品名称:': 'name', '商品编号:': 'skuid', '产品类别:': 'type',
                        '包装规格:': 'package', '干湿沸点:': 'boiling', '货号:': 'ArticleNumber',
                        '商品毛重:': 'GrossWeight', '商品产地:': 'CommodityOrigin',
                        '产品工艺:': 'process', '安装位置:': 'Installation',
                        '类别:': 'type', '材质:': 'texture'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, key in label_to_key.items():
                if label in text:
                    sql[key] = text.replace(label, '')
                    break
    # Replace empty fields with NULL and finish the INSERT statement
    # ('texture' is the last key in the dict, so it closes the VALUES list)
    for i in sql:
        if len(str(sql[i])) == 0:
            sql[i] = 'NULL'
        if i != "texture":
            db += f"'{sql[i]}',"
        else:
            db += f"'{sql[i]}');"
    '''
    Option 1: write the INSERT statements to a file first, then bulk-load them with `source`
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    '''
    Option 2: insert directly
    '''
    conneMysql(db)


def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # when inserting, remember to commit, or the rows will not show up in the database
        conn.commit()
        print("Insert succeeded")

if __name__ == '__main__':
    readUrl()
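Note that getHTML draws a single proxy up front and then reuses it for all (up to) 100 retries; if that proxy is dead, every attempt fails the same way. A sketch of a variant that fetches a fresh proxy per attempt and backs off between tries (the function name and defaults are mine, not from the original):

import time

def get_html_with_backoff(url, headers, tries=10):
    for i in range(tries):
        proxy = get_proxy()  # fresh proxy on every attempt
        try:
            r = requests.get(url, headers=headers,
                             proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                             timeout=3)
            if r.status_code == 200:
                return r.text
        except Exception:
            pass
        time.sleep(min(2 ** i, 30))  # exponential backoff, capped at 30 seconds
    return ''  # caller treats empty text as "skip this URL"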

Additives

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
import ast
def get_proxy():
     # This is my personal proxy pool;
     # I won't show it here - if you need one, try building a proxy pool yourself.
     # Feel free to ask me if anything is unclear; I will publish a tutorial soon.
     return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # random User-Agent generator
    # the request headers can then be written as:
    cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
    headers = {"User-Agent": ua.random,
               'Cookie': cookie}
    trytimes = 100  # 重试的次数

    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers, proxies={"http": "https://{}".format(proxy)}, timeout=1)
            # response = requests.get(url, headers=headers,timeout=3)
            #	注意此处也可能是302等状态码
            if response.status_code == 200:
                break
        except:
            # logdebug(f'requests failed {i}time')
            print(f'requests failed {i} time','要获取的URL:',url)


    return response.text

def readUrl():
    with open('E:\\url\\tjj\\JD_TJJ_URLS.txt', 'r', encoding='utf-8') as jd:
        for line in jd.readlines():
            jd_Href_Name = ast.literal_eval(line.strip())
            href = jd_Href_Name['href_url']
            brand = jd_Href_Name['bran_name']
            price = jd_Href_Name['price']
            # this file has no skuId column; the sku is parsed from the page instead
            # ti.sleep(random.random() * 1)
            getProduct(href, brand, price)
def getProduct(https_li_href,brand_name,product_Price):
    db = "INSERT INTO `xxuan_car_jd_tjj_product` VALUES (NULL,"
    sql = {'skuid': '',
           'name': '',
           'brand': '',
           'price':'',
           'url': '',
           'commodity_Name':'',
           'image':'',
           'Additivetype':'',
           'TypesOfAdditives':'',
           'NetContent':'',
           'ArticleNumber':'',
           'GrossWeight':'',
           'CommodityOrigin':''
           }
    sql['url']=https_li_href
    sql['brand']=brand_name
    sql['price']=product_Price
    product_HTML = getHTML(https_li_href)
    produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # Product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # Product image
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # Product spec parameters: map each label on the page to its column key
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        label_to_key = {'商品名称:': 'name', '商品编号:': 'skuid', '添加剂类型:': 'Additivetype',
                        '添加剂种类:': 'TypesOfAdditives', '净含量:': 'NetContent',
                        '货号:': 'ArticleNumber', '商品毛重:': 'GrossWeight',
                        '商品产地:': 'CommodityOrigin'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, key in label_to_key.items():
                if label in text:
                    sql[key] = text.replace(label, '')
                    break
    # Replace empty fields with NULL and finish the INSERT statement
    # ('CommodityOrigin' is the last key in the dict, so it closes the VALUES list)
    for i in sql:
        if len(str(sql[i])) == 0:
            sql[i] = 'NULL'
        if i != "CommodityOrigin":
            db += f"'{sql[i]}',"
        else:
            db += f"'{sql[i]}');"
    '''
    Option 1: write the INSERT statements to a file first, then bulk-load them with `source`
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    '''
    Option 2: insert directly
    '''
    conneMysql(db)


def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # when inserting, remember to commit, or the rows will not show up in the database
        conn.commit()
        print("Insert succeeded")

if __name__ == '__main__':
    readUrl()
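The commented-out branch in getProduct hints at the alternative workflow: append every statement to a text file and bulk-load it later from the mysql client. A minimal sketch of that route, using the path from the comments:

def write_statement(db):
    # append one INSERT per line; load the file later in the mysql client with:
    #   mysql> source E:\xxuan_car_jd_mobil_product.txt
    with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
        w.write(db + '\n')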

OEM Parts

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
import ast
def get_proxy():
     # This is my personal proxy pool;
     # I won't show it here - if you need one, try building a proxy pool yourself.
     # Feel free to ask me if anything is unclear; I will publish a tutorial soon.
     return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # random User-Agent generator
    # the request headers can then be written as:
    cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
    headers = {"User-Agent": ua.random,
               'Cookie': cookie}
    trytimes = 100  # 重试的次数

    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers, proxies={"http": "https://{}".format(proxy)}, timeout=1)
            # response = requests.get(url, headers=headers,timeout=3)
            #	注意此处也可能是302等状态码
            if response.status_code == 200:
                break
        except:
            # logdebug(f'requests failed {i}time')
            print(f'requests failed {i} time','要获取的URL:',url)


    return response.text

def readUrl():
    with open('E:\\url\\YCJ\\JD_YCJ_URLS.txt', 'r', encoding='utf-8') as jd:
        for line in jd.readlines():
            jd_Href_Name = ast.literal_eval(line.strip())
            href = jd_Href_Name['href_url']
            brand = jd_Href_Name['bran_name']
            price = jd_Href_Name['price']
            skuId = jd_Href_Name['skuId']
            # ti.sleep(random.random() * 1)
            getProduct(href, brand, skuId, price)
def getProduct(https_li_href,brand_name,product_sku,product_Price):
    db = "INSERT INTO `xxuan_car_jd_ycj_product` VALUES (NULL,"
    sql = {'skuid': '',
           'name': '',
           'brand': '',
           'freezing': '',
           'url': '',
           'originplace': '',
           'netweight': '',
           'price': '',
           'commodity_Name': '',
           'image': '',
           'category': '',
           'package':'',
           'boiling':'',
           'sales':'',
           'installation':'',
           'transmission':''
           }
    sql['url']=https_li_href
    sql['brand']=brand_name
    sql['skuid']=product_sku
    sql['price']=product_Price
    product_HTML = getHTML(https_li_href)
    produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # Product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # Product image
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # Product spec parameters: map each label on the page to its column key.
    # '商品编号:' is deliberately ignored here: skuid already comes from the URL file.
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        label_to_key = {'商品名称:': 'name', '商品毛重:': 'netweight', '商品产地:': 'originplace',
                        '产品类别:': 'category', '冰点:': 'freezing', '包装规格:': 'package',
                        '干湿沸点:': 'boiling', '销售规格:': 'sales',
                        '安装位置:': 'installation', '变速箱类型:': 'transmission'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, key in label_to_key.items():
                if label in text:
                    sql[key] = text.replace(label, '')
                    break
    # Replace empty fields with NULL and finish the INSERT statement
    # ('transmission' is the last key in the dict, so it closes the VALUES list)
    for i in sql:
        if len(str(sql[i])) == 0:
            sql[i] = 'NULL'
        if i != "transmission":
            db += f"'{sql[i]}',"
        else:
            db += f"'{sql[i]}');"
    '''
    Option 1: write the INSERT statements to a file first, then bulk-load them with `source`
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    '''
    Option 2: insert directly
    '''
    conneMysql(db)


def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # when inserting, remember to commit, or the rows will not show up in the database
        conn.commit()
        print("Insert succeeded")

if __name__ == '__main__':
    readUrl()
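By this point conneMysql (plus the identical conneMysql_HK) has been pasted into every script. Factoring it into one shared module would cut the duplication; a sketch:

# db_helper.py - shared by all the crawler scripts
import pymysql

def execute_insert(sql):
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           db='jd_qipei', charset='utf8', autocommit=True)
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
        print("Insert succeeded")
    except Exception as e:
        print("Insert failed:", e)
    finally:
        conn.close()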

Fetching the data with multiprocessing

Only one script is shown here; the others follow the same pattern.

Spark Plugs

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
from multiprocessing import Pool
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
import ast
def get_proxy():
    return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # random User-Agent generator
    # the request headers can then be written as:
    cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
    headers = {"User-Agent": ua.random,
               'Cookie': cookie}
    trytimes = 1000  # 重试的次数

    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers, proxies={"http": "https://{}".format(proxy)}, timeout=3)
            # response = requests.get(url, headers=headers,timeout=3)
            #	注意此处也可能是302等状态码
            if response.status_code == 200:
                break
        except:
            # logdebug(f'requests failed {i}time')
            print(f'requests failed {i} time','要获取的URL:',url)


    return response.text

def readUrl():
    ALL_URL = []
    with open('E:\\url\\HHS\\JD_HHS_URLS.txt', 'r', encoding='utf-8') as jd:
        for line in jd.readlines():
            jd_Href_Name = ast.literal_eval(line.strip())
            ALL_URL.append(jd_Href_Name)
    return ALL_URL
def getProduct(jd_Href_Name):
    https_li_href = jd_Href_Name['href_url']
    if "ccc-x.jd.com" in str(https_li_href):
        https_li_href = https_li_href[len('https:'):]  # trim the leading 'https:' from ccc-x.jd.com links
    brand_name = jd_Href_Name['bran_name']
    product_Price = jd_Href_Name['price']
    product_Sku = jd_Href_Name['skuId']
    db = "INSERT INTO `xxuan_car_jd_hhs_product` VALUES (NULL,"
    sql = {'skuid': '',
           'name': '',
           'brand': '',
           'price':'',
           'url': '',
           'commodity_Name':'',
           'image':'',
           'sales':'',
           'material': '',
           'type': '',
           'ArticleNumber': '',
           'GrossWeight': ''
           }
    sql['url']=https_li_href
    sql['brand']=brand_name
    sql['price']=product_Price
    sql['skuid']=product_Sku
    product_HTML = getHTML(https_li_href)
    produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # Product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # Product image
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # Product spec parameters: map each label on the page to its column key
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        label_to_key = {'商品名称:': 'name', '销售规格:': 'sales', '产品材质:': 'material',
                        '产品类型:': 'type', '货号:': 'ArticleNumber', '商品毛重:': 'GrossWeight'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, key in label_to_key.items():
                if label in text:
                    sql[key] = text.replace(label, '')
                    break
    # Replace empty fields with NULL and finish the INSERT statement
    # ('GrossWeight' is the last key in the dict, so it closes the VALUES list)
    for i in sql:
        if len(str(sql[i])) == 0:
            sql[i] = 'NULL'
        if i != "GrossWeight":
            db += f"'{sql[i]}',"
        else:
            db += f"'{sql[i]}');"
    conneMysql(db)


def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # when inserting, remember to commit, or the rows will not show up in the database
        conn.commit()
        print("Insert succeeded")

if __name__ == '__main__':
    urls = readUrl()
    pool = Pool(processes=10)
    pool.map(getProduct, urls)
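pool.map blocks until every URL has been processed. Using the pool as a context manager (Python 3.3+) makes the shutdown explicit:

if __name__ == '__main__':
    urls = readUrl()
    with Pool(processes=10) as pool:
        pool.map(getProduct, urls)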


Summary

This article fetched the data in two ways, plain sequential requests and a multiprocessing pool, so the crawl runs reasonably efficiently. I hope it helps, and thanks for reading.