爬取代码 (scraping script):
"""Scrape 2018 vegetable price data from vipveg.com into 2018price.csv.

Drives a real Chrome browser via Selenium: opens the yearly price index,
clicks through the 12 monthly links, pages through each month's table using
the pager input box, and appends one CSV record per table row (25 rows per
page).  A commented-out MySQL path is kept for reference.
"""
import pandas
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests, sys
from time import sleep
from selenium import webdriver
from lxml import etree
import csv
import pymysql
import codecs
import pandas as pd

options = webdriver.ChromeOptions()
# Skip image loading to speed up page fetches.
# FIX: the original pref key was misspelled ("mamaged"), so Chrome silently
# ignored it and images were still downloaded.
options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2})
# Hide the automation flag so sites are less likely to detect Selenium.
options.add_experimental_option('excludeSwitches', ['enable-automation'])

chrome_driver = 'C:/Users/admin/Virtualenv/flask-env/Scripts/chromedriver.exe'  # chromedriver location
driver = webdriver.Chrome(executable_path=chrome_driver)

# print('importing into database')
# connect = pymysql.connect(host='localhost', user='root', password='112121', db='nongchanpin', port=3306)
# cursor = connect.cursor()
# print("database connection established")

url = 'http://www.vipveg.com/price/2018/'  # 2018 price index page
yue = []  # per-month marker (13 - month index) recorded when paging stopped — TODO confirm intent
liu = []  # page number reached when each month's paging stopped

driver.get(url)  # open the index page
driver.maximize_window()
driver.implicitly_wait(3)

# Enter the 2018 section (second link in the year row of the index table).
driver.find_element_by_xpath(
    '/html/body/div/table[6]/tbody/tr/td[2]/table[1]/tbody/tr[2]/td/table/tbody/tr[4]/td[2]/a[2]'
).click()

try:
    for i in range(1, 13):  # one link per month
        driver.find_element_by_xpath(
            '/html/body/div/table[6]/tbody/tr/td[2]/table[1]/tbody/tr[2]/td/table/tbody/tr[2]/td[2]/a[' + str(i) + ']'
        ).click()
        page = 1
        sleep(1)
        # Page through the month; the loop ends when an XPath lookup fails
        # (i.e. we ran past the last page or a short final page).
        while True:
            try:
                pager = driver.find_element_by_xpath('//*[@id="pager"]/span/input')
                pager.clear()
                pager.send_keys(page)
                sleep(1)
                page += 1
                # Press Enter in the pager box to jump to the requested page.
                action = ActionChains(driver)
                action.key_down(Keys.ENTER).key_up(Keys.ENTER).perform()
                sleep(2)
                for row in range(1, 26):  # 25 data rows per page
                    base = ('/html/body/div/table[6]/tbody/tr/td[2]/table[2]/tbody/tr[2]'
                            '/td/table/tbody/tr[2]/td/table/tbody/tr[' + str(row) + ']')
                    wupin = driver.find_element_by_xpath(base + '/td[1]').text       # product name
                    shichang = driver.find_element_by_xpath(base + '/td[2]').text    # market name
                    price2 = driver.find_element_by_xpath(base + '/td[5]').text[1:]  # price, leading currency symbol stripped
                    time = driver.find_element_by_xpath(base + '/td[6]').text        # collection date
                    # (Removed dead local `price = float(price2) * 2` — it was
                    # never used anywhere.)
                    print(time)
                    # Re-open in append mode per row so partial progress
                    # survives a crash; `with` closes the file automatically.
                    with open("2018price.csv", 'a', encoding="gbk") as f1:
                        print("{},{},{},{},{},{}".format(
                            wupin, '蔬菜', '元/公斤', price2, shichang, time), file=f1)
                    # cursor.execute(
                    #     'insert into 2021price(名称,类别,单位,价格,市场名称,采集时间)VALUES ("{}","{}","{}","{}","{}","{}")'.format(wupin,'蔬菜','元/公斤',price2,shichang,time))
                    # connect.commit()
            except Exception:
                # Ran off the end of the month (or the layout changed): record
                # where we stopped and move on to the next month.
                yue.append(13 - i)
                liu.append(page)
                break
except Exception:
    # A month link was missing or stale; abandon the remaining months but
    # still report what was collected so far.
    pass

print(yue)
print(liu)
# cursor.close()
# connect.close()