最近在学习爬虫,写了一个爬取阿里巴巴1688网站数据的爬虫,使用selenium与BeautifulSoup,现在简单记录一下
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support import wait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
from hashlib import md5
import requests
import time
import pymysql
import re
import os
import random
import traceback
class Item():
    """Value holder for one parsed 1688 offer.

    All fields default to the empty string and are filled in one by one
    by get1688.parseOneItem before the object is handed to pipeMysql.
    """
    # Offer card basics.
    img_url = ""     # product image URL
    title = ""       # product title (img alt text)
    price = ""       # per-quantity price segments, "label:value;" joined
    money = ""       # 30-day turnover (or unit-price badge fallback)
    # Shop information.
    factory_name = ""      # shop / supplier name
    year = ""              # years the shop has existed
    the_type = ""          # business type of the shop
    head_turn = ""         # customer repurchase rate
    consigment_in_30 = ""  # average dispatch speed over 30 days
    # Service-rating rows ("label:value" strings).
    houmiao = ""     # goods-description rating
    xiangying = ""   # responsiveness rating
    fahuo = ""       # dispatch rating
class get1688():
    """Crawl 1688.com search results for a keyword with Selenium, parse each
    offer card with BeautifulSoup, download its image and store the record
    in a MySQL table named after the keyword.
    """

    def __init__(self, name):
        # name: the search keyword; it doubles as the MySQL table name and
        # the image folder name, so it should be a safe identifier.
        self.name = name
        self.begin()

    def begin(self):
        # Thin hook kept so the crawl entry point can be overridden.
        self.getHTML()

    def getHTML(self):
        """Drive the browser: search the keyword, walk every result page,
        parse all offers on a page and persist them before moving on.

        Unexpected errors are appended to D:\\log.txt; the browser is
        always shut down in the finally block.
        """
        driver = webdriver.Chrome()
        driver.get("https://www.1688.com")
        driver.maximize_window()
        try:
            input_tag = driver.find_element(By.ID, "home-header-searchbox")
            input_tag.send_keys(self.name)
            search = driver.find_element(By.XPATH, '//*[@id="app"]/div/div[3]/section/div[2]/div/div/div/form/fieldset/div/div[2]/button')
            time.sleep(1)
            search.click()
            # Total page count is read from the pager's 9th anchor.
            num = int(driver.find_element(By.XPATH, '//*[@id="fui_widget_5"]/span/a[9]').text)
            for index in range(num):
                # Best effort: dismiss the overlay that sometimes covers the list.
                try:
                    driver.find_element(By.XPATH, '//*[@id="s-module-overlay"]/div[2]/div/div[2]').click()
                except Exception:
                    pass
                # Scroll to the bottom so the lazy-loaded offers render, then
                # wait until the last card (#offer60) is present in the DOM.
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                w = wait.WebDriverWait(driver, 15)
                w.until(ec.presence_of_element_located((By.CSS_SELECTOR, '#offer60')))
                print("共%s页,当前正在爬取第%s页" % (str(num), str(index + 1)))
                result = []
                lis = driver.find_elements(By.CSS_SELECTOR, ".sm-offer-item.sw-dpl-offer-item")
                i = 1
                for li in lis:
                    # Hovering over the offer is required before all of its
                    # attribute values are rendered into the DOM.
                    ActionChains(driver).move_to_element(driver.find_element(By.ID, 'offer' + str(i))).perform()
                    time.sleep(random.randint(1, 3))
                    HTML = driver.find_element(By.ID, 'offer' + str(i)).get_attribute('innerHTML')
                    time.sleep(random.randint(1, 3))
                    item = self.parseOneItem(HTML)
                    if item is not None:
                        result.append(item)
                    i += 1
                # BUGFIX: flush once per page. The old `if i % 30 == 0` flush
                # never cleared `result`, so the first 30 items were re-sent on
                # the second flush, and a page with fewer than 30 offers was
                # never saved at all.
                if result:
                    self.pipeMysql(result)
                print("第%s页爬取成功" % (str(index + 1)))
                driver.find_element(By.XPATH, '//*[@id="fui_widget_5"]/span/a[10]').click()
                time.sleep(3)
        except Exception:
            # Append the traceback to the log instead of dying silently.
            with open("D:" + os.path.sep + "log.txt", 'a') as fo:
                fo.write(traceback.format_exc())
                fo.write("\n\n")
        finally:
            # quit() closes every window and ends the driver process; the old
            # close()+quit() pair was redundant.
            driver.quit()

    # Parse one offer card.
    def parseOneItem(self, html):
        """Parse one offer card's innerHTML into an Item.

        Returns None when the card lacks essential fields (image, title,
        turnover, type/price block or the three service-rating rows), so a
        single malformed card no longer aborts the whole crawl.
        """
        soup = BeautifulSoup(html, 'lxml')
        try:
            img_tag = soup.find('div', class_="sm-offer-photo sw-dpl-offer-photo hover").find('a').find('img')
            img_url = img_tag.get("src")
            title = img_tag.get('alt')
        except AttributeError:
            return None
        try:
            # 30-day turnover; fall back to the unit-price badge when absent.
            money_in_30day = str(soup.find('span', class_="sm-offer-trade sw-dpl-offer-trade sm-offer-tradeBt").find('em').string)
        except AttributeError:
            try:
                money_in_30day = str(soup.find('span', class_="sm-offer-priceNum sw-dpl-offer-priceNum").get("title"))
            except AttributeError:
                return None
        try:
            year = soup.find('a', attrs={"title": "阿里巴巴建议您优先选择诚信通会员"}).string
        except AttributeError:
            year = "暂无记录"
        try:
            consigment_in_30 = str(soup.find("span", class_="sm-offer-localservice-num").string)
        except AttributeError:
            consigment_in_30 = "暂无记录"
        try:
            head_turn = soup.find("div", class_='sm-widget-offershopwindowshoprepurchaserate').find_all('span')[2].string
            # A missing or over-long value means the cell held something else.
            if head_turn is None or len(head_turn) > 10:
                head_turn = "暂无"
        except (AttributeError, IndexError):
            head_turn = '暂无'
        try:
            the_type = str(soup.find("div", class_="sm-widget-offershopwindowshoprepurchaserate").find('i').string)
            # str() turns a missing value into the literal string "None"; the
            # original compared against None AFTER str(), which never matched.
            if the_type in ("", "None"):
                the_type = "暂无"
            factory_name = soup.find("a", attrs={"t-click-item": "com"}).get("title")
            price = ""
            for price_span in soup.find('div', class_="imgofferresult-hoverBlock").select("div.s-widget-offershopwindowdealinfo.sm-offer-dealInfo")[0].find_all("span"):
                price += str(price_span.find("em").get("title") + ":" + price_span.find('i').get("title") + ";")
        except (AttributeError, IndexError, TypeError):
            # The original let these escape and abort the whole crawl.
            return None
        try:
            smoffs = soup.find('div', class_="sm-offer-bsr-info").find_all('div', class_="sm-offer-bsr-row")
            # BUGFIX: pair every label with the value from the SAME row. The
            # original read smoffs[0]'s value for all three metrics, and a
            # chained `fahou=houmiao=` assignment clobbered houmiao as well.
            houmiao = smoffs[0].find('span', class_="sm-offer-bsr-sub").string + ":" + smoffs[0].find('span', class_="sm-offer-bsr-value-red").string
            xiangying = smoffs[1].find('span', class_="sm-offer-bsr-sub").string + ":" + smoffs[1].find('span', class_="sm-offer-bsr-value-red").string
            fahuo = smoffs[2].find('span', class_="sm-offer-bsr-sub").string + ":" + smoffs[2].find('span', class_="sm-offer-bsr-value-red").string
        except (AttributeError, IndexError, TypeError):
            return None
        item = Item()
        item.img_url = img_url
        item.title = title
        item.money = money_in_30day
        item.year = year
        item.consigment_in_30 = consigment_in_30
        item.the_type = the_type
        item.factory_name = factory_name
        item.price = price
        item.houmiao = houmiao
        item.xiangying = xiangying
        item.fahuo = fahuo
        item.head_turn = head_turn
        return item

    # Store into MySQL; fill in your own host/user/passwd.
    def pipeMysql(self, items, host="*", user="*", passwd="*", database="1688"):
        """Download each item's image to D:\\img_1688\\<keyword>\\ and insert
        the record into the table named after the search keyword.

        Images are de-duplicated by md5(img_url + keyword); an already
        downloaded image means the row was stored before and is skipped.
        Commits every 5 rows and once more at the end; the connection and
        cursor are always closed.
        """
        # Keyword arguments keep this working on pymysql >= 1.0, where the
        # old positional form of connect() was removed.
        conn = pymysql.connect(host=host, user=user, password=passwd, database=database)
        cursor = conn.cursor()
        try:
            # Ensure the table exists (created on first run). The original
            # re-ran `show tables` once per item.
            self.table_exists(cursor, conn, self.name)
            img_dir = "D:" + os.sep + "img_1688" + os.path.sep + self.name
            if not os.path.exists(img_dir):
                os.makedirs(img_dir)
            # NOTE(security): the table name comes from user input and is
            # interpolated into the SQL text; backticks help, but the keyword
            # should really be validated against an identifier whitelist.
            sql = "insert into `" + self.name + "` values(%(img_url)s,%(title)s,%(money)s,%(head_turn)s,%(year)s,%(consigment_in_30)s,%(the_type)s,%(factory_name)s,%(price)s,%(houmiao)s,%(xiangying)s,%(fahuo)s)"
            for i, item in enumerate(items):
                new_md5 = md5()
                new_md5.update((item.img_url + self.name).encode(encoding='utf-8'))
                filepath = img_dir + os.path.sep + new_md5.hexdigest() + ".jpg"
                # BUGFIX: skip only the duplicate; the original `return`
                # threw away every remaining item in the batch.
                if os.path.exists(filepath):
                    continue
                response = requests.get(item.img_url)
                with open(filepath, 'bw') as fo:
                    fo.write(response.content)
                cursor.execute(sql, {
                    "img_url": filepath,
                    "title": item.title,
                    "money": item.money,
                    "head_turn": item.head_turn,
                    "year": item.year,
                    "consigment_in_30": item.consigment_in_30,
                    "the_type": item.the_type,
                    "factory_name": item.factory_name,
                    "price": item.price,
                    "houmiao": item.houmiao,
                    "xiangying": item.xiangying,
                    "fahuo": item.fahuo
                })
                if i % 5 == 0:
                    conn.commit()
            conn.commit()
        finally:
            cursor.close()
            conn.close()

    # Return True if the table exists; create it (and return False) otherwise.
    def table_exists(self, cursor, conn, table):
        cursor.execute("show tables")
        # Each row of `show tables` is a 1-tuple holding one table name; the
        # original round-tripped the rows through str() and a regex.
        tables_list = [row[0] for row in cursor.fetchall()]
        if table not in tables_list:
            try:
                # Consistency fix: create the table named by the *table*
                # parameter (the original ignored it and used self.name).
                sql = '''
                CREATE TABLE `''' + table + '''` (
                `pic_url` varchar(1000),
                `商品名称` varchar(2000),
                `30天成交额` varchar(255),
                `客户回头率` varchar(255),
                `店铺成立年份` varchar(255),
                `30天平均发货速度` varchar(255),
                `经营类型` varchar(255),
                `店铺名称` varchar(500),
                `区段价格` varchar(1000),
                `货描` varchar(255),
                `响应` varchar(255),
                `发货` varchar(255));
                '''
                cursor.execute(sql)
                conn.commit()
            except Exception:
                with open("D:" + os.path.sep + "log.txt", 'a') as fo:
                    fo.write(traceback.format_exc())
                    fo.write("\n\n")
            return False
        return True
if __name__ == "__main__":
    # Run only when executed as a script; importing the module must not
    # start a browser session.
    key = input("查询内容为:")
    get1688(key)