import re
import pymysql
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# --- Module-level setup: browser, database connection, result table -------
brower = webdriver.Chrome()
# NOTE(review): localhost / root / empty password assumed — confirm for the
# target environment.  charset so Chinese titles, cities and shop names are
# stored intact.
con = pymysql.connect(host='localhost', user='root', password='',
                      db='taobao', port=3306, charset='utf8mb4')
cur = con.cursor()
# IF NOT EXISTS so re-running the script does not crash on the existing table.
# `prince` widened to FLOAT(8,2): FLOAT(4,2) caps prices at 99.99 yuan.
cur.execute(
    "CREATE TABLE IF NOT EXISTS yifu ("
    "id INT(4) NOT NULL AUTO_INCREMENT PRIMARY KEY,"
    "title VARCHAR(60),"
    "prince FLOAT(8,2),"
    "people INT(10),"
    "city VARCHAR(10),"
    "shop VARCHAR(20),"
    "img VARCHAR(200))"
)
def search():
    """Open taobao.com, search for '衣服' (clothes) and scrape page 1.

    Loads the home page, types the keyword into the search box, clicks the
    search button, waits for the pager's total-page indicator, scrapes the
    first result page via get_products(), and returns the indicator text
    (e.g. '共 100 页,') for main() to parse.

    Returns:
        str: text of the '#mainsrp-pager ... div.total' element.

    Raises:
        TimeoutException: if the page fails to load after 3 attempts.
        (Bounded retry replaces the original unbounded recursion, which
        could exhaust the recursion limit on a dead page.)
    """
    for _ in range(3):
        try:
            brower.get('https://www.taobao.com')
            # presence_of_element_located: exactly one element is needed,
            # so there is no reason to collect a list and index [0].
            in_put = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button'))
            )
            in_put.send_keys('衣服')
            submit.click()
            total_page = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.total'))
            )
            get_products()
            return total_page.text
        except TimeoutException:
            continue  # retry the whole sequence
    raise TimeoutException('search() failed after 3 attempts')
def next_page(page_num):
    """Jump to result page *page_num* via the pager input box and scrape it.

    Clears the page-number input, types the target page, clicks the pager's
    submit button, waits until the pager highlights that page number, then
    scrapes the page via get_products().

    :param page_num: 1-based page number to load.
    :return: None
    :raises TimeoutException: if the page does not load after 3 attempts.
        (Bounded retry replaces the original unbounded recursion.)
    """
    for _ in range(3):
        try:
            # presence_of_element_located: a single element is expected,
            # so no list + [0] indexing is needed.
            in_put = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input'))
            )
            submit = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
            )
            in_put.clear()
            in_put.send_keys(page_num)
            submit.click()
            # Confirm navigation: the active pager item must show page_num.
            active = WebDriverWait(brower, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_num))
            )
            print(active)
            get_products()
            return
        except TimeoutException:
            continue  # retry this page
    raise TimeoutException('next_page(%s) failed after 3 attempts' % page_num)
def get_products():
    """Parse the currently loaded result page and persist each product.

    Waits for the item grid to be present, parses brower.page_source with
    lxml, extracts image / price / sales-count / title / city / shop for
    every product card, prints each record, and inserts one row per product
    into the `yifu` table.  Commits once per page.
    """
    WebDriverWait(brower, 10).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR,
             '#mainsrp-itemlist > div > div > div:nth-child(1) > div'))
    )
    doc = etree.HTML(brower.page_source)
    items = doc.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')
    # Compile once per page (raw string), not once per product card.
    people_re = re.compile(r'(\d+)')
    for item in items:
        try:
            product = {
                'img': item.xpath('./div/div/div/a/img/@data-src')[0],
                'prince': float(item.xpath('./div[2]/div/div/strong/text()')[0]),
                'people': int(people_re.search(
                    item.xpath('./div[2]/div[1]/div[2]/text()')[0]).group(1)),
                'title': item.xpath('./div/div/div/a/img/@alt')[0],
                'city': item.xpath('./div[2]/div[3]/div[2]/text()')[0],
                'shop': item.xpath('./div[2]/div[3]/div/a/span[2]/text()')[0],
            }
        except (IndexError, ValueError, AttributeError):
            # Ad cards / lazily-loaded cards lack some of these fields;
            # skip the card instead of aborting the whole crawl.
            continue
        print(product)
        cur.execute(
            "INSERT INTO yifu (title,prince,people,city,shop,img) "
            "VALUES (%s,%s,%s,%s,%s,%s)",
            (product['title'], product['prince'], product['people'],
             product['city'], product['shop'], product['img']))
    # One commit per page instead of one per row.
    con.commit()
def main():
    """Drive the crawl: search page 1, then walk pages 2..N.

    Parses the total page count out of search()'s return text (e.g.
    '共 100 页,') and iterates the remaining pages.  Cleanup runs in a
    finally block so the DB connection and the browser are released even
    if a page fails mid-crawl.
    """
    try:
        total_text = search()
        total = int(re.search(r'(\d+)', total_text).group(1))
        print(total)
        for page in range(2, total + 1):
            next_page(page)
    finally:
        con.close()
        # quit() shuts down the whole WebDriver session; close() would
        # only close the window and leave chromedriver running.
        brower.quit()
if __name__ == '__main__':
main()