#coding:utf-8
from selenium.webdriver.common.by import By #引入判断元素加载模块
from selenium.webdriver.support.ui import WebDriverWait #引入判断元素加载模块
from selenium.webdriver.support import expected_conditions as EC #引入判断元素加载模块
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import requests
import xlwt
import datetime
from time import sleep
class Spider(object):
    """Drive a browser through Taobao seller pages and export table data.

    The instance owns a Selenium Chrome driver (``self.web``), a
    ``requests`` session (``self.req``) and a set of browser-like HTTP
    headers (``self.headers``).  ``login`` copies the browser cookies into
    those headers, ``Hangye`` dumps the paginated industry table of the
    sycm home page into an ``.xls`` workbook and ``ZTC`` opens a subway
    insight query page.
    """

    # CSS selectors of the industry table card on the sycm home page.  They
    # are composed from a shared prefix so the long path is spelled once.
    _CARD_CSS = ('#floor-industry > div.oui-card-content > div > div:nth-child(3)'
                 ' > div > div.alife-dt-card-common-table > div')
    _TABLE_CSS = (_CARD_CSS
                  + ' > div.alife-dt-card-common-table-tableContainer > div > table')
    _TBODY_CSS = _TABLE_CSS + ' > tbody'
    _HEADER_ROW_CSS = _TABLE_CSS + ' > thead > tr'
    _NEXT_PAGE_CSS = (_CARD_CSS
                      + ' > div.alife-dt-card-common-table-pagination-container'
                      + ' > div > span.oui-pager-next.oui-link-third > i')

    def __init__(self):
        """Start the browser and prepare the HTTP headers and session."""
        # Location of the browser executable that the driver should launch.
        browser_path = r'D:\360极速浏览器\360Chrome\Chrome\Application\360chrome.exe'
        chrome_options = Options()
        chrome_options.binary_location = browser_path
        # NOTE(review): newer Selenium releases appear to spell this keyword
        # ``options=`` -- confirm against the installed Selenium version.
        self.web = webdriver.Chrome(chrome_options=chrome_options)
        self.web.maximize_window()
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'
        }
        self.req = requests.Session()
        self.cookies = {}
        # Filled in by login() when the ``_tb_token_`` cookie is present;
        # initialised here so reading it never raises AttributeError.
        self.token = None

    def getYesterday(self):
        """Return yesterday's date as a ``datetime.date``."""
        return datetime.date.today() - datetime.timedelta(days=1)

    @staticmethod
    def _parse_cookies(cookies):
        """Turn browser cookie dicts into a ``Cookie`` header value and token.

        :param cookies: iterable of dicts with ``"name"`` and ``"value"`` keys.
        :return: ``(header, token)`` where ``header`` is every cookie rendered
            as ``name=value;`` and concatenated, and ``token`` is the value of
            the last ``_tb_token_`` cookie, or ``None`` when there is none.
        """
        parts = []
        token = None
        for cookie in cookies:
            parts.append(cookie["name"] + "=" + cookie["value"] + ";")
            if cookie["name"] == '_tb_token_':
                token = cookie["value"]
        return "".join(parts), token

    @staticmethod
    def _write_row(sheet, row, values):
        """Write ``values`` into consecutive columns of ``row``, from column 0."""
        for col, value in enumerate(values):
            sheet.write(row, col, value)

    def login(self):
        """Log in through the quick-login button and capture the cookies.

        Stores the cookie string in ``self.cookies`` and
        ``self.headers['Cookie']``; stores the ``_tb_token_`` value in
        ``self.token`` when that cookie exists.

        :return: ``self.headers`` including the ``Cookie`` entry.
        """
        wait = WebDriverWait(self.web, 10)  # element wait limit
        # Page-load limit; the original author notes that both timeouts have
        # to be set for the limit to take effect.
        self.web.set_page_load_timeout(10)
        self.web.set_script_timeout(10)
        self.web.get('https://mai.taobao.com/')
        # Wait until the Qianniu quick-login button is clickable, then use it.
        quick_login = wait.until(
            EC.element_to_be_clickable((By.ID, 'J_SubmitQuick')))
        quick_login.click()
        self.web.get('https://mai.taobao.com/')
        cookie_header, token = self._parse_cookies(self.web.get_cookies())
        if token is not None:
            self.token = token
        self.cookies = cookie_header
        self.headers['Cookie'] = self.cookies
        return self.headers

    def Hangye(self, excel, max_pages=10):
        """Export the paginated industry table to an ``.xls`` workbook.

        Row 0 of the sheet receives the table header, every following row
        one table row; each row is prefixed with a date column holding
        yesterday's date.

        :param excel: path the workbook is saved to.
        :param max_pages: number of table pages to read (default 10).
        """
        wait = WebDriverWait(self.web, 10)  # element wait limit
        driver = self.web
        driver.get('https://sycm.taobao.com/portal/home.htm')
        # Scroll to the bottom of the page.
        driver.execute_script("var q=document.documentElement.scrollTop=100000")

        wbk = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = wbk.add_sheet('sheet 1', cell_overwrite_ok=True)

        sleep(3)  # fixed pause before waiting for the table
        table = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, self._TBODY_CSS)))
        print(table)

        # Header row, with the extra date column in front.
        header_row = driver.find_element(By.CSS_SELECTOR, self._HEADER_ROW_CSS)
        header_cells = header_row.find_elements(By.TAG_NAME, 'th')
        self._write_row(sheet, 0, ['日期'] + [th.text for th in header_cells])

        date_text = str(self.getYesterday())
        next_row = 1  # next free sheet row; row 0 holds the header
        try:
            for page in range(1, max_pages + 1):
                print("第", page, "页")
                tbody = driver.find_element(By.CSS_SELECTOR, self._TBODY_CSS)
                for tr in tbody.find_elements(By.TAG_NAME, 'tr'):
                    cells = tr.find_elements(By.TAG_NAME, 'td')
                    self._write_row(
                        sheet, next_row, [date_text] + [td.text for td in cells])
                    # Advance by the rows actually written instead of
                    # assuming a fixed number of rows per page.
                    next_row += 1
                if page < max_pages:
                    # Only page forward when another page will be read.
                    # NOTE(review): nothing waits for the table to refresh
                    # after this click -- confirm the rows update
                    # synchronously, otherwise an explicit wait is needed.
                    driver.find_element(
                        By.CSS_SELECTOR, self._NEXT_PAGE_CSS).click()
        finally:
            # Save whatever was collected even if paging failed part-way.
            wbk.save(excel)

    def ZTC(self):
        """Open the subway insight query-result page in the browser."""
        url = 'https://subway.simba.taobao.com/#!/tools/insight/queryresult?kws=%E5%A5%B3%E5%8C%85&tab=tabs-category'
        driver = self.web
        driver.get(url)
        sleep(3)  # fixed pause before requesting the page again
        # NOTE(review): the page is requested twice on purpose in the
        # original; presumably the first load does not land on the target
        # view -- confirm.
        driver.get(url)
if __name__ == '__main__':
    # Script entry point: log in, export the industry table to a workbook
    # named after yesterday's date, then open the subway insight page.
    spider = Spider()
    spider.login()
    yesterday = spider.getYesterday()
    # Path of the workbook to write.
    excel_path = "".join((
        r'C:\Users\Administrator\Desktop\关键词\\',
        str(yesterday),
        '关键词.xls',
    ))
    spider.Hangye(excel_path)
    spider.ZTC()