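# Disclaimer: this code is for learning and exchange only. The people who shared
# and wrote it accept no responsibility for anything that results from others
# running it maliciously. Do not change the rate-limiting parameters without
# authorization and do not maliciously attack websites. Readers are asked to
# observe public morality and the law. Losses such as a site crash caused by
# crawling are entirely the responsibility of the person operating the computer;
# serious consequences may carry criminal liability.
# Crawler writing for hire: email leon_leon@yeah.net
# Scraper for the National Agricultural Products Business Information Public Service Platform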
import requests
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import Process
import threading
import re
from tqdm import tqdm
from selenium.webdriver.chrome.options import Options
# Price-list page of nc.mofcom.gov.cn. The craftName query parameter is the
# percent-encoded product name; keep exactly one url_base line active.
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%8C%AA%E8%82%89'  # pork
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%BE%8A%E8%82%89'  # mutton
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%8E%89%E7%B1%B3'  # corn
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%99%BD%E6%9D%A1%E9%B8%A1'  # chicken
url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E9%B8%A1%E8%9B%8B'  # eggs

options = Options()
UA = UserAgent().edge
# Pass the bare user-agent string: everything after '=' is used verbatim, so
# wrapping it in quotes or leaving a trailing space would corrupt the header.
options.add_argument('--user-agent={}'.format(UA))
# options.add_argument('--proxy-server={}'.format(proxy))  # e.g. 124.236.111.11:80
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"

# Create the browser exactly once. A second webdriver.Chrome() call would open
# a new browser without the script registered below and leak the first one.
# (The original noted an explicit chromedriver path under
# "D:\\Program Files\\python3.7\\" as an alternative way to locate the driver.)
edge = webdriver.Chrome(options=options)

# Hide navigator.webdriver on every new document before page scripts run.
edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})


def _click(xpath, timeout=10):
    """Wait until the element at *xpath* is clickable, then click it.

    Raises selenium's TimeoutException if the element is not clickable
    within *timeout* seconds.
    """
    WebDriverWait(edge, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    ).click()


edge.get(url_base)

# Open the picker attached to #eudName and choose the second tab, then the
# second entry of its second list.
# NOTE(review): presumably a region/market selector - confirm against the page.
_click('//*[@id="eudName"]')
_click('/html/body/div[3]/div[1]/a[2]')
_click('/html/body/div[3]/div[2]/ul[2]/li[2]')

# Open the dropdown in the third search-form field and pick its third option.
# NOTE(review): presumably the time-range filter - confirm. dd[4] and explicit
# laydate cells were the alternatives tried previously:
#_click('//*[@id="searchForm"]/div/div[3]/div[1]/dl/dd[4]')
_click('//*[@id="searchForm"]/div/div[3]/div[1]/div/input')
_click('//*[@id="searchForm"]/div/div[3]/div[1]/dl/dd[3]')
#_click('//*[@id="layui-laydate1"]/div[1]/div[2]/table/tbody/tr[1]/td[6]')
#_click('//*[@id="layui-laydate1"]/div[2]/div[2]/table/tbody/tr[1]/td[2]')

# Submit the search.
_click('//*[@id="searchBtn"]')
# Column accumulators, filled page by page from the result table.
data_all = []
product_all = []
price_all = []
market_all = []

# Second-to-last pager link; it is the "next page" button while more pages exist.
NEXT_PAGE_XPATH = '//*[@id="pageFooter"]/a[last()-1]'

sleep(2)  # let the first result page render after the search click
while True:
    # Parse a snapshot of the current page with lxml.
    e = etree.HTML(edge.page_source)
    data = e.xpath('''//table[@class='table-01 mt30']/tbody[1]/tr/td[1]/text()''')
    product = e.xpath('''//table[@class='table-01 mt30']/tbody[1]/tr/td[2]/span/text()''')
    price = e.xpath('''//*[@id="showList"]/table/tbody/tr/td[3]/span/text()''')
    market = e.xpath('''//*[@id="showList"]/table/tbody/tr/td[4]/a/text()''')
    print(data)

    # extend() appends in place instead of re-copying the whole list each page.
    data_all.extend(data)
    product_all.extend(product)
    price_all.extend(price)
    market_all.extend(market)
    print('rows collected so far:', len(data_all))

    # Stop on the last page, where that pager link is no longer labelled '下一页'.
    if e.xpath(NEXT_PAGE_XPATH + '/text()') != ['下一页']:
        break

    edge.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
    # Pause AFTER the click: this keeps the 5-second request throttle and also
    # gives the next page time to load, so the next pass does not re-read the
    # page that was just scraped.
    sleep(5)
# Assemble the scraped columns and export them as CSV. The browser is always
# shut down afterwards, even if building or writing the table fails.
all_info = {
    '数据年月': data_all,
    '产品': product_all,
    '价格': price_all,
    '市场': market_all,
}
try:
    # pandas raises ValueError here if the four columns differ in length.
    outdata = pd.DataFrame(all_info)
    # gb18030 is a superset of GBK: GBK-range text is encoded the same way,
    # but characters outside GBK no longer raise UnicodeEncodeError.
    outdata.to_csv('C:\\Users\\Admin\\PycharmProjects\\untitled\\鸡蛋价格.csv', encoding='gb18030')
finally:
    # Release the Chrome and chromedriver processes started by the script.
    edge.quit()