selenium爬取中国经济与社会发展统计数据库

最新推荐文章于 2024-02-23 11:52:58 发布

小蜗笔记

最新推荐文章于 2024-02-23 11:52:58 发布

阅读量1.4k

点赞数 1

分类专栏：爬虫实战模块

本文链接：https://blog.csdn.net/qq_42830971/article/details/109599664

版权

爬虫实战模块专栏收录该内容

50 篇文章 11 订阅

订阅专栏

声明：代码仅作学习交流用途，代码分享者与创作者不承担任何由他人恶意运行而导致的责任。请勿擅自修改限制频率的参数，请勿恶意攻击网页；请学习浏览者遵守社会公德与法律秩序。爬虫导致的网页崩溃等损失由计算机操作者负全部责任，造成严重后果的需要承担刑事责任。
爬虫代写：邮箱 leon_leon@yeah.net


from selenium import webdriver
from time import sleep
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import pandas as pd
from random import randint
edge = webdriver.Edge()

# Open the portal home page and click the second entry of the navigation list.
# NOTE(review): the click appears to open the data page in a new window,
# since the code switches window handles right after — confirm on the live site.
edge.get("https://data.cnki.net/NewHome/Index")
edge.find_element(By.XPATH, '/html/body/div[1]/div[4]/ul/li[2]/a').click()
sleep(6)
# Walk through every handle so the driver ends up focused on the last window.
for handle in edge.window_handles:
    edge.switch_to.window(handle)

year = [str(i) for i in range(1994,2015)]
month = [str(i) for i in range(1,13)]
# Total number of (year, month) pages; used as the progress denominator.
total_months = len(year) * len(month)
for year_index,year_item in enumerate(year):
    # Pick the year in the "selYear" drop-down, then pause a random 5-9 s.
    Select(edge.find_element(By.ID, "selYear")).select_by_value(year_item)
    sleep(randint(5,9))
    for month_index,month_item in enumerate(month):
        # Click the link whose "mval" attribute equals the month number.
        edge.find_element(By.XPATH, '''//div[@id='J_conNav09']/div/ul/li[@class='item'][1]/a[@mval={}]'''.format(month_item)).click()
        sleep(randint(10,13))
        # Focus the last window handle (the page opened by the click above).
        for handle in edge.window_handles:
            edge.switch_to.window(handle)
        html = edge.page_source
        e = etree.HTML(html)
        # Rows after the second one hold the data: first cell is the row label,
        # the remaining cells carry the values inside <span> elements.
        row_name = e.xpath('//tbody/tr[position()>2]/td[1]/text()')
        data = e.xpath('//tbody/tr[position()>2]/td[position()>1]/span/text()')
        # The second table row supplies the column headers.
        column_name = e.xpath('//tbody/tr[2]/td/text()')
        df = pd.DataFrame(index=row_name, columns=column_name)
        row_num = len(row_name)
        col_num = len(column_name)
        # `data` is a flat, row-major list; copy it cell by cell into the frame.
        for i in range(row_num):
            for j in range(col_num):
                df.iloc[i,j] = data[i*col_num + j]
        df.to_csv('{0}年{1}月进出口数据.csv'.format(year_item,month_item), encoding='GBK')
        # Report progress only after the month is saved, counting completed
        # months so the value runs from just above 0% up to exactly 100%.
        done_months = year_index * len(month) + (month_index + 1)
        print('已成功获取{}%数据'.format(done_months / total_months * 100))
        # Close the month window and return to the second handle
        # (the page that holds the year/month selectors).
        edge.close()
        handles = edge.window_handles
        edge.switch_to.window(handles[1])

from selenium import webdriver
from time import sleep
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import pandas as pd
from random import randint
edge = webdriver.Edge()

# Open the portal home page and click the second entry of the navigation list.
# NOTE(review): the click appears to open the data page in a new window,
# since the code switches window handles right after — confirm on the live site.
edge.get("https://data.cnki.net/NewHome/Index")
edge.find_element(By.XPATH, '/html/body/div[1]/div[4]/ul/li[2]/a').click()
sleep(6)
# Walk through every handle so the driver ends up focused on the last window.
for handle in edge.window_handles:
    edge.switch_to.window(handle)

year = [str(i) for i in range(1994,2015)]
month = [str(i) for i in range(1,13)]
# Total number of (year, month) pages; used as the progress denominator.
total_months = len(year) * len(month)
for year_index,year_item in enumerate(year):
    # Click "Button2" whenever it is visible before using the page.
    # NOTE(review): presumably this dismisses a pop-up — confirm on the live site.
    button = edge.find_element(By.ID,"Button2")
    if button.is_displayed():
        button.click()
    # Pick the year in the "selYear" drop-down, then pause a random 5-9 s.
    Select(edge.find_element(By.ID, "selYear")).select_by_value(year_item)
    sleep(randint(5, 9))
    for month_index,month_item in enumerate(month):
        button = edge.find_element(By.ID, "Button2")
        if button.is_displayed():
            button.click()
        # Click the link whose "mval" attribute equals the month number.
        edge.find_element(By.XPATH, '''//div[@id='J_conNav09']/div/ul/li[@class='item'][1]/a[@mval={}]'''.format(month_item)).click()
        sleep(randint(10,13))
        # Focus the last window handle (the page opened by the click above).
        for handle in edge.window_handles:
            edge.switch_to.window(handle)
        html = edge.page_source
        e = etree.HTML(html)
        # Rows after the second one hold the data: first cell is the row label,
        # the remaining cells carry the values inside <span> elements.
        row_name = e.xpath('//tbody/tr[position()>2]/td[1]/text()')
        data = e.xpath('//tbody/tr[position()>2]/td[position()>1]/span/text()')
        # The second table row supplies the column headers.
        column_name = e.xpath('//tbody/tr[2]/td/text()')
        df = pd.DataFrame(index=row_name, columns=column_name)
        row_num = len(row_name)
        col_num = len(column_name)
        # `data` is a flat, row-major list; copy it cell by cell into the frame.
        for i in range(row_num):
            for j in range(col_num):
                df.iloc[i,j] = data[i*col_num + j]
        df.to_csv('{0}年{1}月进出口数据.csv'.format(year_item,month_item), encoding='GBK')
        # Report progress only after the month is saved, counting completed
        # months so the value runs from just above 0% up to exactly 100%.
        done_months = year_index * len(month) + (month_index + 1)
        print('已成功获取{}%数据'.format(done_months / total_months * 100))
        # Close the month window and return to the second handle
        # (the page that holds the year/month selectors).
        edge.close()
        handles = edge.window_handles
        edge.switch_to.window(handles[1])


# Second pass: years 2020 down to 2015, chosen through anchor links instead of
# the "selYear" drop-down. Reuses the `edge` browser session opened earlier.
year = [str(i) for i in range(2020,2014,-1)]
month = [str(i) for i in range(1,13)]
for year_index,year_item in enumerate(year):
    # Click "Button2" whenever it is visible before using the page.
    # NOTE(review): presumably this dismisses a pop-up — confirm on the live site.
    if edge.find_element(By.ID,"Button2").is_displayed():
        edge.find_element(By.ID,"Button2").click()
    # XPath positions are 1-based, hence year_index+1.
    # NOTE(review): assumes the anchors are listed from 2020 downwards — confirm.
    edge.find_element(By.XPATH,'//*[@id="J_headroom"]/div[2]/div/a[{}]'.format(year_index+1)).click()
    sleep(randint(5, 9))
    for month_index,month_item in enumerate(month):
        if edge.find_element(By.ID, "Button2").is_displayed():
            edge.find_element(By.ID, "Button2").click()
        # Click the link whose "mval" attribute equals the month number.
        edge.find_element(By.XPATH, '''//div[@id='J_conNav09']/div/ul/li[@class='item'][1]/a[@mval={}]'''.format(month_item)).click()
        sleep(randint(10,13))
        # Focus the last window handle (the page opened by the click above).
        for handle in edge.window_handles:
            edge.switch_to.window(handle)
        html = edge.page_source
        e = etree.HTML(html)
        # Rows after the second one hold the data: first cell is the row label,
        # the remaining cells carry the values inside <span> elements.
        row_name = e.xpath('//tbody/tr[position()>2]/td[1]/text()')
        data = e.xpath('//tbody/tr[position()>2]/td[position()>1]/span/text()')
        # The second table row supplies the column headers.
        column_name = e.xpath('//tbody/tr[2]/td/text()')
        df = pd.DataFrame(index=row_name, columns=column_name)
        row_num = len(row_name)
        col_num = len(column_name)
        # `data` is a flat, row-major list; copy it cell by cell into the frame.
        for i in range(row_num):
            for j in range(col_num):
                df.iloc[i,j] = data[i*col_num + j]
        df.to_csv('{0}年{1}月进出口数据.csv'.format(year_item,month_item), encoding='GBK')
        # Announce the month only once its CSV has been written.
        print('已成功获取{0}年{1}月数据'.format(year_item,month_item))
        # Close the month window and return to the second handle
        # (the page that holds the year/month selectors).
        edge.close()
        handles = edge.window_handles
        edge.switch_to.window(handles[1])
print('爬虫100%')

from selenium import webdriver
from time import sleep
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import pandas as pd
from random import randint
edge = webdriver.Edge()

# Load the portal home page, click the second navigation entry, wait, and
# leave the driver focused on the last window handle.
edge.get("https://data.cnki.net/NewHome/Index")
nav_link = edge.find_element(By.XPATH, '/html/body/div[1]/div[4]/ul/li[2]/a')
nav_link.click()
sleep(6)
for window_id in edge.window_handles:
    edge.switch_to.window(window_id)

year = list(map(str, range(1995, 2015)))
month = list(map(str, range(1, 13)))
for year_item in year:
    # Choose the year in the "selYear" drop-down and pause 4-6 s.
    year_select = Select(edge.find_element(By.ID, "selYear"))
    year_select.select_by_value(year_item)
    sleep(randint(4, 6))
    for month_item in month:
        print('已成功获取{0}年{1}月数据'.format(year_item, month_item))
        # Click the link whose "mval" attribute equals the month number.
        month_xpath = '''//div[@id='J_conNav09']/div/ul/li[@class='item'][1]/a[@mval={}]'''.format(month_item)
        edge.find_element(By.XPATH, month_xpath).click()
        sleep(randint(7, 10))
        # Visit every handle in order so focus lands on the last one.
        for window_id in edge.window_handles:
            edge.switch_to.window(window_id)
        tree = etree.HTML(edge.page_source)
        # Row labels: first cell of every row after the second.
        labels = tree.xpath('//tbody/tr[position()>2]/td[1]/text()')
        # Values: every text node below the remaining cells of those rows.
        cells = tree.xpath('//tbody/tr[position()>2]/td[position()>1]//text()')
        # Column headers: cells of the second row.
        headers = tree.xpath('//tbody/tr[2]/td/text()')
        table = pd.DataFrame(index=labels, columns=headers)
        n_cols = len(headers)
        # `cells` is flat and row-major; divmod turns the flat position back
        # into a (row, column) pair.
        for flat_pos in range(len(labels) * n_cols):
            table.iloc[divmod(flat_pos, n_cols)] = cells[flat_pos]
        csv_name = '{0}年{1}月进出口数据.csv'.format(year_item, month_item)
        table.to_csv(csv_name, encoding='GBK')
        # Close the month window and go back to the second handle.
        edge.close()
        edge.switch_to.window(edge.window_handles[1])


# Second pass: years 2020 down to 2015, chosen through anchor links instead of
# the "selYear" drop-down. Reuses the `edge` browser session opened earlier.
year = [str(i) for i in range(2020,2014,-1)]
month = [str(i) for i in range(1,13)]
for year_index,year_item in enumerate(year):
    # XPath positions are 1-based, hence year_index+1.
    # NOTE(review): assumes the anchors are listed from 2020 downwards — confirm.
    edge.find_element(By.XPATH,'//*[@id="J_headroom"]/div[2]/div/a[{}]'.format(year_index+1)).click()
    sleep(randint(5, 9))
    for month_index,month_item in enumerate(month):
        print('已成功获取{0}年,{1}月数据'.format(year_item,month_item))
        # Click the link whose "mval" attribute equals the month number.
        edge.find_element(By.XPATH, '''//div[@id='J_conNav09']/div/ul/li[@class='item'][1]/a[@mval={}]'''.format(month_item)).click()
        sleep(randint(5,7))
        # Focus the last window handle (the page opened by the click above).
        for handle in edge.window_handles:
            edge.switch_to.window(handle)
        html = edge.page_source
        e = etree.HTML(html)
        # Rows after the second one hold the data: first cell is the row label,
        # the remaining cells carry the values inside <span> elements.
        row_name = e.xpath('//tbody/tr[position()>2]/td[1]/text()')
        data = e.xpath('//tbody/tr[position()>2]/td[position()>1]/span/text()')
        # The second table row supplies the column headers.
        column_name = e.xpath('//tbody/tr[2]/td/text()')
        df = pd.DataFrame(index=row_name, columns=column_name)
        row_num = len(row_name)
        col_num = len(column_name)
        # `data` is a flat, row-major list; copy it cell by cell into the frame.
        for i in range(row_num):
            for j in range(col_num):
                df.iloc[i,j] = data[i*col_num + j]
        df.to_csv('{0}年{1}月进出口数据.csv'.format(year_item,month_item), encoding='GBK')
        # Close the month window and return to the second handle
        # (the page that holds the year/month selectors).
        edge.close()
        handles = edge.window_handles
        edge.switch_to.window(handles[1])
print('爬虫100%')