A few words up front:
I'm a software engineering student, and I use this blog to record what I'm learning; I hope it also helps others studying the same things.
Everyone runs into difficulties and setbacks in life, and avoiding them solves nothing; the only way through is to meet life's challenges with optimism.
Youth grows old before learning is done; not an inch of time should be taken lightly.
My favorite saying: finish today's work today.
I've only just started with crawlers, so please bear with any shortcomings; pointers are very welcome.
Series table of contents
From a Python Crawler to Spark Data Preprocessing: A Real-World Project [Part 1]
From a Python Crawler to Spark Data Preprocessing: A Real-World Project [Part 2]
From a Python Crawler to Spark Data Preprocessing: A Real-World Project [Part 3]
From a Python Crawler to Spark Data Preprocessing: A Real-World Project [Part 4]
From a Python Crawler to Spark Data Preprocessing: A Real-World Project [Part 5]
Preface
This chapter uses Requests to fetch the data directly.
The fetching itself is straightforward, but one thing deserves attention:
when you request a product link directly, the detail page that comes back does not contain the price.
That is why the price has to be captured back in chapter two, while Selenium is collecting the product listings.
Note: what follows is the main body of this article; the examples below are for reference.
The data comes from
From a Python Crawler to Spark Data Preprocessing: A Real-World Project [Part 2]
where chapter two used Selenium to collect the product URLs and write them to local files;
here we simply read those files back in.
For example, the first script below fetches each detail page directly.
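Each line of that URL file is a Python dict literal written by the Selenium stage. A minimal sketch of what readUrl expects (the sample values here are made up, and ast.literal_eval is a safer stand-in for eval):
import ast

# one line of JD_HHS_URLS.txt as written in part two (the values are hypothetical)
line = "{'href_url': 'https://item.jd.com/100012345678.html', 'bran_name': 'NGK', 'price': '56.00', 'skuId': '100012345678'}"

record = ast.literal_eval(line.strip())  # parses the dict literal without executing arbitrary code
print(record['href_url'], record['bran_name'], record['price'], record['skuId'])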
Spark plugs
# -*- coding: utf-8 -*-
import ast  # needed for literal_eval when reading the saved URL dicts
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
def get_proxy():
    # This is my personal proxy pool.
    # I won't show it here; if you need one, try building your own pool.
    # Feel free to ask if anything is unclear; I'll put out a tutorial soon.
return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # instantiate a random User-Agent generator
    # the request headers can then be written as:
cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
headers = {"User-Agent": ua.random,
'Cookie': cookie}
    trytimes = 1000  # number of retries
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=3)
            # response = requests.get(url, headers=headers, timeout=3)
            # note: the status code here may also be 302 or similar
            if response.status_code == 200:
                break
        except Exception:
            # logdebug(f'requests failed {i} time')
            print(f'requests failed {i} time', 'URL to fetch:', url)
    if response is None:
        return ''  # every attempt failed; empty HTML lets the caller continue
    return response.text
def readUrl():
with open('E:\\url\\HHS\\JD_HHS_URLS.txt','r',encoding='utf-8') as jd:
jd_lines=jd.readlines()
for line in jd_lines:
            jd_Href_Name = ast.literal_eval(line.strip())  # safer than eval for parsing the saved dicts
            href = jd_Href_Name['href_url']
            if "ccc-x.jd.com" in str(href):
                href = href[len('https:'):]
brand=jd_Href_Name['bran_name']
price=jd_Href_Name['price']
skuId=jd_Href_Name['skuId']
# print(jd_Href_Name)
# print('href:',href,' ','bran-name',brand)
# # ti.sleep(random.random()*1)
getProduct(href,brand,skuId,price)
def getProduct(https_li_href,brand_name,product_Sku,product_Price):
db = "INSERT INTO `xxuan_car_jd_hhs_product` VALUES (NULL,"
sql = {'skuid': '',
'name': '',
'brand': '',
'price':'',
'url': '',
'commodity_Name':'',
'image':'',
'sales':'',
'material': '',
'type': '',
'ArticleNumbera': '',
'GrossWeight': ''
}
sql['url']=https_li_href
sql['brand']=brand_name
sql['price']=product_Price
sql['skuid']=product_Sku
product_HTML = getHTML(https_li_href)
produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (the real URL sits in the lazy-load attribute data-origin)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # product specification parameters
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        # map each "label:" prefix in the spec list to its key in the sql dict
        # ('ArticleNumbera' matches the key spelled that way in the dict above)
        field_map = {'商品名称:': 'name', '销售规格:': 'sales', '产品材质:': 'material',
                     '产品类型:': 'type', '货号:': 'ArticleNumbera', '商品毛重:': 'GrossWeight'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for prefix, key in field_map.items():
                if prefix in text:
                    sql[key] = text.replace(prefix, '')
                    break
# print(sql)
for i in sql:
if len(str(sql[i])) == 0:
sql[i] = 'NULL'
if i != "GrossWeight":
db += f"'{sql[i]}',"
else:
db += f"'{sql[i]}');"
# print(db)
    '''
    Option 1: build the INSERT statements first, write them to a file,
    then load them into MySQL with `source`.
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    # print(db)
    '''
    Option 2: insert directly.
    '''
# print(db)
conneMysql(db)
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
readUrl()
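A note on the INSERT built above: it starts with VALUES (NULL, which presupposes a table whose first column is an auto-increment primary key, followed by one text column per key of the sql dict. The article never shows the DDL, so the following is only my assumption of a compatible table, not the author's exact schema:
import pymysql

# hypothetical DDL matching the leading NULL id plus the 12 values getProduct assembles
CREATE_HHS = """
CREATE TABLE IF NOT EXISTS `xxuan_car_jd_hhs_product` (
    id INT AUTO_INCREMENT PRIMARY KEY,
    skuid VARCHAR(64), name VARCHAR(255), brand VARCHAR(128), price VARCHAR(32),
    url VARCHAR(512), commodity_Name VARCHAR(512), image VARCHAR(512), sales VARCHAR(128),
    material VARCHAR(128), type VARCHAR(128), ArticleNumbera VARCHAR(128), GrossWeight VARCHAR(64)
) DEFAULT CHARSET = utf8;
"""

conn = pymysql.connect(host='localhost', user='root', password='root', db='jd_qipei', charset='utf8')
with conn.cursor() as cur:
    cur.execute(CREATE_HHS)
conn.commit()
conn.close()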
Engine oil
# -*- coding: utf-8 -*-
import ast  # needed for literal_eval when reading the saved URL dicts
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
def get_proxy():
    # This is my personal proxy pool.
    # I won't show it here; if you need one, try building your own pool.
    # Feel free to ask if anything is unclear; I'll put out a tutorial soon.
return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # instantiate a random User-Agent generator
    # the request headers can then be written as:
cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
headers = {"User-Agent": ua.random,
'Cookie': cookie}
    trytimes = 1  # number of retries
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=1)
            # response = requests.get(url, headers=headers, timeout=3)
            # note: the status code here may also be 302 or similar
            if response.status_code == 200:
                break
        except Exception:
            # logdebug(f'requests failed {i} time')
            print(f'requests failed {i} time', 'URL to fetch:', url)
            return 'False'  # sentinel checked by the caller; the URL gets parked in the HK table
    return response.text
def readUrl():
with open('D:\\url\\jy\\JD_JY_URLS.txt','r',encoding='utf-8') as jd:
jd_lines=jd.readlines()
for line in jd_lines:
            jd_Href_Name = ast.literal_eval(line.strip())  # safer than eval for parsing the saved dicts
            if 'https://ccc-x.jd.com' not in str(jd_Href_Name['href_url']):
href=jd_Href_Name['href_url']
brand=jd_Href_Name['bran_name']
# print(jd_Href_Name)
ti.sleep(random.random()*1)
getProduct(href,brand)
def getProduct(https_li_href,brand_name):
db = "INSERT INTO `xxuan_car_jd_mobil_product` VALUES (NULL,"
sql = {'skuid': '',
'name': '',
'brand': '',
'type': '',
'url': '',
'originplace': '',
'netweight': '',
'price': '',
'commodity_Name': '',
'image': '',
'viscosity': '',
'volume': ''
}
sql['url']=https_li_href
sql['brand']=brand_name
db_HK = f"INSERT INTO `xxuan_car_jd_hk_mobil_product` VALUES ('{https_li_href}','{brand_name}');"
product_HTML = getHTML(https_li_href)
if product_HTML=='False':
conneMysql_HK(db_HK)
return
produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product price: some detail pages expose it in the summary block
    summary_price = produc_soup.find('div', attrs={'class': 'summary-price J-summary-price'})
    if summary_price is not None:
        p_price = summary_price.find('div', attrs={'class': 'dd'}).find('span', attrs={'class': 'pricing'})
        if p_price is not None:
            p_price = str(p_price.text).replace('[', '').replace(']', '').replace('¥', '')
        else:
            p_price = 'NULL'
        sql['price'] = p_price
    else:
        sql['price'] = 'NULL'
    # product image (the real URL sits in the lazy-load attribute data-origin)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # product specification parameters
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        # map each "label:" prefix in the spec list to its key in the sql dict
        field_map = {'商品名称:': 'name', '商品编号:': 'skuid', '商品毛重:': 'netweight',
                     '商品产地:': 'originplace', '粘度:': 'viscosity', '机油种类:': 'type',
                     '容量:': 'volume'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for prefix, key in field_map.items():
                if prefix in text:
                    sql[key] = text.replace(prefix, '')
                    break
# print(sql)
for i in sql:
if len(str(sql[i])) == 0:
sql[i] = 'NULL'
if i != "volume":
db += f"'{sql[i]}',"
else:
db += f"'{sql[i]}');"
    '''
    Option 1: build the INSERT statements first, write them to a file,
    then load them into MySQL with `source`.
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    # print(db)
    '''
    Option 2: insert directly.
    '''
# print(db)
conneMysql(db)
def conneMysql_HK(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
readUrl()
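Unlike the spark-plug script, this one does not lose a URL when the fetch fails: getHTML returns the string 'False', and the URL plus brand are parked in xxuan_car_jd_hk_mobil_product via db_HK. A sketch of how those parked rows could be read back for a second pass (my addition; the two columns follow the db_HK INSERT above):
import pymysql

def read_failed_urls():
    # the HK table holds the (url, brand) pairs written whenever getHTML returned 'False'
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           db='jd_qipei', charset='utf8')
    with conn.cursor() as cur:
        cur.execute("SELECT * FROM `xxuan_car_jd_hk_mobil_product`")
        rows = cur.fetchall()
    conn.close()
    return rows  # each row is (href_url, brand_name), ready for another getProduct pass

for href, brand in read_failed_urls():
    getProduct(href, brand)  # assumes this runs in the same module as the script above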
Tires
# -*- coding: utf-8 -*-
import ast  # needed for literal_eval when reading the saved URL dicts
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
def get_proxy():
    # This is my personal proxy pool.
    # I won't show it here; if you need one, try building your own pool.
    # Feel free to ask if anything is unclear; I'll put out a tutorial soon.
return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # instantiate a random User-Agent generator
    # the request headers can then be written as:
cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
headers = {"User-Agent": ua.random,
'Cookie': cookie}
    trytimes = 1  # number of retries
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=1)
            # response = requests.get(url, headers=headers, timeout=3)
            # note: the status code here may also be 302 or similar
            if response.status_code == 200:
                break
        except Exception:
            # logdebug(f'requests failed {i} time')
            print(f'requests failed {i} time', 'URL to fetch:', url)
            return 'False'  # sentinel checked by the caller; the URL gets parked in the HK table
    return response.text
def readUrl():
All_URL=[]
with open('E:\\url\\luntai\\JD_LT_URLS.txt','r',encoding='utf-8') as jd:
jd_lines=jd.readlines()
for line in jd_lines:
            jd_Href_Name = ast.literal_eval(line.strip())  # safer than eval for parsing the saved dicts
# href=jd_Href_Name['href_url']
# brand=jd_Href_Name['bran_name']
# price=jd_Href_Name['price']
# print(jd_Href_Name)
All_URL.append(jd_Href_Name)
# ti.sleep(random.random()*1)
# getProduct(href,brand,price)
return All_URL
# def getProduct(https_li_href,brand_name,price):
def getProduct(jd_Href_Name):
# print(jd_Href_Name)
ti.sleep(random.random()*2)
https_li_href=jd_Href_Name['href_url']
brand_name=jd_Href_Name['bran_name']
price=jd_Href_Name['price']
db = "INSERT INTO `xxuan_car_jd_lt_product` VALUES (NULL,"
sql = {'skuid': '',
'name': '',
'brand': '',
'url': '',
'price': '',
'commodity_Name': '',
'image': '',
'netweight': '',
'originplace': '',
'size': '',
'width': '',
'number': '',
'performance': '',
'Flattening': '',
'characteristics':'',
'type':''
}
sql['url']=https_li_href
sql['brand']=brand_name
sql['price']=price
db_HK = f"INSERT INTO `xxuan_car_jd_lt_hk_product` VALUES ('{https_li_href}','{brand_name}');"
product_HTML = getHTML(https_li_href)
if product_HTML=='False':
conneMysql_HK(db_HK)
return
produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
            # print("product title:", sku_name)
    # the price already comes from the URL file for tires
    # product image (the real URL sits in the lazy-load attribute data-origin)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # product specification parameters
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        # map each "label:" prefix in the spec list to its key in the sql dict
        field_map = {'商品名称:': 'name', '商品编号:': 'skuid', '商品毛重:': 'netweight',
                     '商品产地:': 'originplace', '尺寸:': 'size', '胎面宽度:': 'width',
                     '扁平比:': 'Flattening', '货号:': 'number', '花纹性能:': 'performance',
                     '轮胎特性:': 'characteristics', '车型类别:': 'type'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for prefix, key in field_map.items():
                if prefix in text:
                    sql[key] = text.replace(prefix, '')
                    break
# print(sql)
for i in sql:
if len(str(sql[i])) == 0:
sql[i] = 'NULL'
if i != "type":
db += f"'{sql[i]}',"
else:
db += f"'{sql[i]}');"
# print(db)
conneMysql(db)
def conneMysql_HK(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
urls=readUrl()
pool=Pool(processes=5)
pool.map(getProduct,urls)
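The tire script fans the URL list out over five worker processes, and each getProduct call opens its own MySQL connection. That is the right move, since pymysql connections cannot be shared across processes. Because the work is network-bound rather than CPU-bound, threads would serve equally well; a sketch using multiprocessing.dummy, which exposes the same Pool API on top of threads (my suggestion, not the author's code):
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed, same Pool interface

if __name__ == '__main__':
    urls = readUrl()
    with ThreadPool(10) as pool:  # ten threads; each getProduct still opens its own connection
        pool.map(getProduct, urls)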
Brake pads
# -*- coding: utf-8 -*-
import ast  # needed for literal_eval when reading the saved URL dicts
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
def get_proxy():
    # This is my personal proxy pool.
    # I won't show it here; if you need one, try building your own pool.
    # Feel free to ask if anything is unclear; I'll put out a tutorial soon.
return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # instantiate a random User-Agent generator
    # the request headers can then be written as:
cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
headers = {"User-Agent": ua.random,
'Cookie': cookie}
    trytimes = 100  # number of retries
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=1)
            # response = requests.get(url, headers=headers, timeout=3)
            # note: the status code here may also be 302 or similar
            if response.status_code == 200:
                break
        except Exception:
            # logdebug(f'requests failed {i} time')
            print(f'requests failed {i} time', 'URL to fetch:', url)
    if response is None:
        return ''  # every attempt failed; empty HTML lets the caller continue
    return response.text
def readUrl():
with open('E:\\url\\SCP\\JD_SCP_URLS.txt','r',encoding='utf-8') as jd:
jd_lines=jd.readlines()
for line in jd_lines:
            jd_Href_Name = ast.literal_eval(line.strip())  # safer than eval for parsing the saved dicts
href=jd_Href_Name['href_url']
brand=jd_Href_Name['bran_name']
price=jd_Href_Name['price']
skuId=jd_Href_Name['skuId']
# print(jd_Href_Name)
# print('href:',href,' ','bran-name',brand)
# ti.sleep(random.random()*1)
getProduct(href,brand,skuId,price)
def getProduct(https_li_href,brand_name,product_Sku,product_Price):
db = "INSERT INTO `xxuan_car_jd_scp_product` VALUES (NULL,"
sql = {'skuid': '',
'name': '',
'brand': '',
'price':'',
'url': '',
'commodity_Name':'',
'image':'',
'Additivetype':'',
'TypesOfAdditives':'',
'NetContent':'',
'ArticleNumber':'',
'boiling':'',
'package':'',
'GrossWeight':'',
'CommodityOrigin':'',
'process':'',
'Installation':'',
'type':'',
'texture':''
}
sql['url']=https_li_href
sql['brand']=brand_name
sql['price']=product_Price
sql['skuid']=product_Sku
product_HTML = getHTML(https_li_href)
produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (the real URL sits in the lazy-load attribute data-origin)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # product specification parameters
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        # map each "label:" prefix to its key; '产品类别:' must stay ahead of the looser
        # prefix '类别:' so the more specific label wins, as in the original elif chain
        field_map = {'商品名称:': 'name', '商品编号:': 'skuid', '产品类别:': 'type',
                     '包装规格:': 'package', '干湿沸点:': 'boiling', '货号:': 'ArticleNumber',
                     '商品毛重:': 'GrossWeight', '商品产地:': 'CommodityOrigin',
                     '产品工艺:': 'process', '安装位置:': 'Installation',
                     '类别:': 'type', '材质:': 'texture'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for prefix, key in field_map.items():
                if prefix in text:
                    sql[key] = text.replace(prefix, '')
                    break
# print(sql)
for i in sql:
if len(str(sql[i])) == 0:
sql[i] = 'NULL'
if i != "texture":
db += f"'{sql[i]}',"
else:
db += f"'{sql[i]}');"
# print(db)
    '''
    Option 1: build the INSERT statements first, write them to a file,
    then load them into MySQL with `source`.
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    # print(db)
    '''
    Option 2: insert directly.
    '''
# print(db)
conneMysql(db)
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
readUrl()
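With trytimes = 100, the brake-pad script hammers the same proxy a hundred times with no pause, so a dead proxy burns every attempt. A variant that draws a fresh proxy per attempt and backs off between failures (a sketch of one common pattern, not the author's code; it reuses the get_proxy helper above):
import time
import requests

def getHTML_with_backoff(url, headers, max_tries=10):
    for i in range(max_tries):
        proxy = get_proxy()  # draw a fresh proxy for each attempt
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=3)
            if response.status_code == 200:
                return response.text
        except requests.RequestException as e:
            print(f'attempt {i} failed: {e}')
        time.sleep(0.5 * (i + 1))  # linear backoff between attempts
    return ''  # caller treats empty HTML as a failed fetch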
Additives
# -*- coding: utf-8 -*-
import ast  # needed for literal_eval when reading the saved URL dicts
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
def get_proxy():
    # This is my personal proxy pool.
    # I won't show it here; if you need one, try building your own pool.
    # Feel free to ask if anything is unclear; I'll put out a tutorial soon.
return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # instantiate a random User-Agent generator
    # the request headers can then be written as:
cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
headers = {"User-Agent": ua.random,
'Cookie': cookie}
    trytimes = 100  # number of retries
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=1)
            # response = requests.get(url, headers=headers, timeout=3)
            # note: the status code here may also be 302 or similar
            if response.status_code == 200:
                break
        except Exception:
            # logdebug(f'requests failed {i} time')
            print(f'requests failed {i} time', 'URL to fetch:', url)
    if response is None:
        return ''  # every attempt failed; empty HTML lets the caller continue
    return response.text
def readUrl():
with open('E:\\url\\tjj\\JD_TJJ_URLS.txt','r',encoding='utf-8') as jd:
jd_lines=jd.readlines()
for line in jd_lines:
            jd_Href_Name = ast.literal_eval(line.strip())  # safer than eval for parsing the saved dicts
href=jd_Href_Name['href_url']
brand=jd_Href_Name['bran_name']
price=jd_Href_Name['price']
# skuId=jd_Href_Name['skuId']
# print(jd_Href_Name)
# print('href:',href,' ','bran-name',brand)
# ti.sleep(random.random()*1)
getProduct(href,brand,price)
def getProduct(https_li_href,brand_name,product_Price):
db = "INSERT INTO `xxuan_car_jd_tjj_product` VALUES (NULL,"
sql = {'skuid': '',
'name': '',
'brand': '',
'price':'',
'url': '',
'commodity_Name':'',
'image':'',
'Additivetype':'',
'TypesOfAdditives':'',
'NetContent':'',
'ArticleNumber':'',
'GrossWeight':'',
'CommodityOrigin':''
}
sql['url']=https_li_href
sql['brand']=brand_name
sql['price']=product_Price
product_HTML = getHTML(https_li_href)
produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (the real URL sits in the lazy-load attribute data-origin)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # product specification parameters
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        # map each "label:" prefix in the spec list to its key in the sql dict
        field_map = {'商品名称:': 'name', '商品编号:': 'skuid', '添加剂类型:': 'Additivetype',
                     '添加剂种类:': 'TypesOfAdditives', '净含量:': 'NetContent',
                     '货号:': 'ArticleNumber', '商品毛重:': 'GrossWeight',
                     '商品产地:': 'CommodityOrigin'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for prefix, key in field_map.items():
                if prefix in text:
                    sql[key] = text.replace(prefix, '')
                    break
# print(sql)
for i in sql:
if len(str(sql[i])) == 0:
sql[i] = 'NULL'
if i != "CommodityOrigin":
db += f"'{sql[i]}',"
else:
db += f"'{sql[i]}');"
# print(db)
    '''
    Option 1: build the INSERT statements first, write them to a file,
    then load them into MySQL with `source`.
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    # print(db)
    '''
    Option 2: insert directly.
    '''
# print(db)
conneMysql(db)
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
readUrl()
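One weakness shared by all of these scripts: values are spliced into the INSERT with f-strings, so any field containing a single quote (common in product titles) breaks the statement, and the string is open to SQL injection. A parameterized rewrite of the insert step (a sketch; the table and column count follow the additive script above):
import pymysql

def insert_product(sql_dict):
    # %s placeholders let the driver quote every value instead of f-string splicing
    stmt = ("INSERT INTO `xxuan_car_jd_tjj_product` VALUES (NULL, "
            + ", ".join(["%s"] * len(sql_dict)) + ")")
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           db='jd_qipei', charset='utf8', autocommit=True)
    try:
        with conn.cursor() as cur:
            # empty fields become the literal string 'NULL', mirroring the original scripts
            cur.execute(stmt, [v or 'NULL' for v in sql_dict.values()])
    finally:
        conn.close()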
OEM parts
# -*- coding: utf-8 -*-
import ast  # needed for literal_eval when reading the saved URL dicts
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
def get_proxy():
    # This is my personal proxy pool.
    # I won't show it here; if you need one, try building your own pool.
    # Feel free to ask if anything is unclear; I'll put out a tutorial soon.
return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # instantiate a random User-Agent generator
    # the request headers can then be written as:
cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
headers = {"User-Agent": ua.random,
'Cookie': cookie}
    trytimes = 100  # number of retries
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=1)
            # response = requests.get(url, headers=headers, timeout=3)
            # note: the status code here may also be 302 or similar
            if response.status_code == 200:
                break
        except Exception:
            # logdebug(f'requests failed {i} time')
            print(f'requests failed {i} time', 'URL to fetch:', url)
    if response is None:
        return ''  # every attempt failed; empty HTML lets the caller continue
    return response.text
def readUrl():
with open('E:\\url\\YCJ\\JD_YCJ_URLS.txt','r',encoding='utf-8') as jd:
jd_lines=jd.readlines()
for line in jd_lines:
            jd_Href_Name = ast.literal_eval(line.strip())  # safer than eval for parsing the saved dicts
href=jd_Href_Name['href_url']
brand=jd_Href_Name['bran_name']
price=jd_Href_Name['price']
skuId=jd_Href_Name['skuId']
# print(jd_Href_Name)
# print('href:',href,' ','bran-name',brand)
# ti.sleep(random.random()*1)
getProduct(href,brand,skuId,price)
def getProduct(https_li_href,brand_name,product_sku,product_Price):
db = "INSERT INTO `xxuan_car_jd_ycj_product` VALUES (NULL,"
sql = {'skuid': '',
'name': '',
'brand': '',
'freezing': '',
'url': '',
'originplace': '',
'netweight': '',
'price': '',
'commodity_Name': '',
'image': '',
'category': '',
'package':'',
'boiling':'',
'sales':'',
'installation':'',
'transmission':''
}
sql['url']=https_li_href
sql['brand']=brand_name
sql['skuid']=product_sku
sql['price']=product_Price
product_HTML = getHTML(https_li_href)
produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (the real URL sits in the lazy-load attribute data-origin)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # product specification parameters; '商品编号:' is deliberately not mapped, because
    # the skuid already comes from the URL file (the original elif chain passed on it)
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        field_map = {'商品名称:': 'name', '商品毛重:': 'netweight', '商品产地:': 'originplace',
                     '产品类别:': 'category', '冰点:': 'freezing', '包装规格:': 'package',
                     '干湿沸点:': 'boiling', '销售规格:': 'sales', '安装位置:': 'installation',
                     '变速箱类型:': 'transmission'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for prefix, key in field_map.items():
                if prefix in text:
                    sql[key] = text.replace(prefix, '')
                    break
# print(sql)
for i in sql:
if len(str(sql[i])) == 0:
sql[i] = 'NULL'
if i != "transmission":
db += f"'{sql[i]}',"
else:
db += f"'{sql[i]}');"
# print(db)
    '''
    Option 1: build the INSERT statements first, write them to a file,
    then load them into MySQL with `source`.
    '''
    # with open('E:\\xxuan_car_jd_mobil_product.txt', 'a', encoding='utf-8') as w:
    #     w.write(db + '\r')
    # print(db)
    '''
    Option 2: insert directly.
    '''
# print(db)
conneMysql(db)
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
readUrl()
Fetching the data with multiprocessing
Only one script is shown here; the rest follow the same pattern.
Spark plugs
# -*- coding: utf-8 -*-
import ast  # needed for literal_eval when reading the saved URL dicts
from bs4 import BeautifulSoup
import requests
from multiprocessing import Pool
from fake_useragent import UserAgent
import pymysql
import time as ti
import random
def get_proxy():
return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']
def getHTML(url):
    proxy = get_proxy()  # fetch a proxy IP from the pool
    ua = UserAgent()  # instantiate a random User-Agent generator
    # the request headers can then be written as:
cookie = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'
headers = {"User-Agent": ua.random,
'Cookie': cookie}
    trytimes = 1000  # number of retries
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": f"http://{proxy}", "https": f"http://{proxy}"},
                                    timeout=3)
            # response = requests.get(url, headers=headers, timeout=3)
            # note: the status code here may also be 302 or similar
            if response.status_code == 200:
                break
        except Exception:
            # logdebug(f'requests failed {i} time')
            print(f'requests failed {i} time', 'URL to fetch:', url)
    if response is None:
        return ''  # every attempt failed; empty HTML lets the caller continue
    return response.text
def readUrl():
ALL_URL=[]
with open('E:\\url\\HHS\\JD_HHS_URLS.txt','r',encoding='utf-8') as jd:
jd_lines=jd.readlines()
for line in jd_lines:
            jd_Href_Name = ast.literal_eval(line.strip())  # safer than eval for parsing the saved dicts
ALL_URL.append(jd_Href_Name)
return ALL_URL
def getProduct(jd_Href_Name):
https_li_href = jd_Href_Name['href_url']
    if "ccc-x.jd.com" in str(https_li_href):
https_li_href = https_li_href[len('https:'):]
brand_name = jd_Href_Name['bran_name']
product_Price = jd_Href_Name['price']
product_Sku = jd_Href_Name['skuId']
db = "INSERT INTO `xxuan_car_jd_hhs_product` VALUES (NULL,"
sql = {'skuid': '',
'name': '',
'brand': '',
'price':'',
'url': '',
'commodity_Name':'',
'image':'',
'sales':'',
'material': '',
'type': '',
'ArticleNumber': '',
'GrossWeight': ''
}
sql['url']=https_li_href
sql['brand']=brand_name
sql['price']=product_Price
sql['skuid']=product_Sku
product_HTML = getHTML(https_li_href)
produc_soup = BeautifulSoup(product_HTML, 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (the real URL sits in the lazy-load attribute data-origin)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    if spec_img is None:
        sql['image'] = 'NULL'
    else:
        sql['image'] = f"https:{spec_img['data-origin']}"
    # product specification parameters
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        # map each "label:" prefix in the spec list to its key in the sql dict
        field_map = {'商品名称:': 'name', '销售规格:': 'sales', '产品材质:': 'material',
                     '产品类型:': 'type', '货号:': 'ArticleNumber', '商品毛重:': 'GrossWeight'}
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for prefix, key in field_map.items():
                if prefix in text:
                    sql[key] = text.replace(prefix, '')
                    break
for i in sql:
if len(str(sql[i])) == 0:
sql[i] = 'NULL'
if i != "GrossWeight":
db += f"'{sql[i]}',"
else:
db += f"'{sql[i]}');"
conneMysql(db)
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # auto-commit inserts; equivalent to calling conn.commit() yourself
    )
    cur = conn.cursor()
    try:
        cur.execute(sql)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # an insert must be committed, or the rows will never appear in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
urls = readUrl()
pool = Pool(processes=10)
pool.map(getProduct, urls)
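pool.map blocks until every URL has been processed and only then returns. If you want visible progress while the ten workers grind through the list, imap_unordered is a drop-in alternative (a sketch; getProduct and readUrl are the functions above):
if __name__ == '__main__':
    urls = readUrl()
    with Pool(processes=10) as pool:
        # yields as each worker finishes instead of waiting for the whole map
        for n, _ in enumerate(pool.imap_unordered(getProduct, urls), 1):
            if n % 100 == 0:
                print(f'{n}/{len(urls)} product pages processed')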