VS Code+Python 爬取数据并保存到excel(未整理)

import random
import traceback
from bs4 import BeautifulSoup
import requests
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Test: scrape a paginated Element-UI table with Selenium and export it to Excel
def test1():
    """Scrape a paginated Element-UI table with Selenium and save all rows to Excel.

    Opens the target page in Edge, waits for the ``el-table`` component to
    render, then walks every result page (1986 "next" clicks) and extracts
    the 10 cells of each ``el-table__row`` into an xlwt workbook.  A new
    sheet is started every 30 000 rows because the legacy .xls format caps
    a sheet at 65 536 rows.

    NOTE(review): the target URL is the placeholder string 'url' — fill in
    the real address before running.
    """
    import xlwt
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    HEADERS = ('列1', '列2', '列3', '列4', '列5', '列6', '列7', '列8', '列9', '列10')

    def _init_sheet(book, title):
        """Add a sheet named *title* to *book*, write the header row, return it."""
        sheet = book.add_sheet(title, cell_overwrite_ok=True)
        for col_idx, header in enumerate(HEADERS):
            sheet.write(0, col_idx, header)
        return sheet

    def _cell_text(row, col_no, empty_placeholder=None):
        """Return the text of 1-based column *col_no* of a <tr> soup node.

        Reproduces the original formatting hack: str() of the cell's
        ``.contents`` with the surrounding list brackets stripped.  When
        *empty_placeholder* is given, an empty cell ("[]") is replaced by it.
        """
        td = row.find('td', {'class': 'el-table_1_column_%d' % col_no})
        divs = td.find_all('div', {'class': 'cell'})
        text = str(divs[0].contents).replace("['", "").replace("']", "")
        if empty_placeholder is not None:
            text = text.replace("[]", empty_placeholder)
        return text

    def _fetch_rows(wait, browser, min_rows):
        """Poll until the table shows more than *min_rows* rows; return the <tr> list."""
        while True:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'el-table')))
            time.sleep(3)  # give the front-end time to re-render rows after the spinner
            # Use the public page_source, not the private wait._driver attribute.
            soup = BeautifulSoup(browser.page_source, features="html.parser")
            rows = soup.find_all('tr', {'class': 'el-table__row'})
            if len(rows) > min_rows:
                return rows

    def _write_rows(sheet, rows, line):
        """Append *rows* to *sheet* starting at row index *line*; return next free line."""
        for row in rows:
            values = [_cell_text(row, c) for c in range(1, 9)]
            # Columns 9 and 10 may be empty; show "无" (none) instead of "[]".
            values += [_cell_text(row, c, empty_placeholder="无") for c in (9, 10)]
            for col_idx, value in enumerate(values):
                sheet.write(line, col_idx, value)
            print(" ".join(values))
            line += 1
        return line

    browser = webdriver.Edge()
    browser.get('url')  # placeholder — replace with the real page URL
    wait = WebDriverWait(browser, 30)

    # First page: block until at least one data row has rendered.
    rows = _fetch_rows(wait, browser, min_rows=0)

    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = _init_sheet(book, '数据1')
    # BUGFIX: the original iterated a hard-coded 100 rows, raising IndexError
    # whenever the first page held fewer; iterate the rows actually found.
    line = _write_rows(sheet, rows, line=1)

    sheet_index = 2
    for page in range(1, 1987):
        # Click the "next page" button of the Element-UI pagination bar.
        next_buttons = browser.find_elements(By.CLASS_NAME, "btn-next")
        next_buttons[0].click()
        # Random polite delay (original comment: don't overload the site; take
        # the data and go — N >= 3 recommended).
        time.sleep(random.randint(1, 3))
        # The stale DOM keeps one leftover row while the new page loads;
        # wait until more than one row is present.
        rows = _fetch_rows(wait, browser, min_rows=1)

        # Roll over to a new sheet well before the .xls 65 536-row hard limit.
        if line > 30000:
            sheet = _init_sheet(book, '数据' + str(sheet_index))
            sheet_index += 1
            line = 1
        line = _write_rows(sheet, rows, line)

    # BUGFIX: xlwt writes the legacy BIFF (.xls) format; saving it under an
    # .xlsx extension produced a file Excel refuses to open cleanly.
    savepath = 'C:/Users/小陈的电脑/Desktop/test.xls'
    book.save(savepath)
# For testing: run the scraper when this script is executed.
test1()

下载(安装)第三方模块使用下面这个。注意:`pip.main` 在 pip 10 之后已被移除,不应在代码中直接调用 pip,推荐通过子进程调用 `python -m pip`:

import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "xlwt"])  # 把 xlwt 换成需要安装的库名称
以下是使用 Python 爬取淘宝 Python 数据分析图书并保存Excel 中的示例代码: ```python import requests import re import xlwt def get_books(): url = 'https://s.taobao.com/search?q=python%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220106&ie=utf8' headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36' } response = requests.get(url, headers=headers) if response.status_code == 200: # 使用正则表达式解析网页内容 pattern = re.compile('<div class="title">.*?<a href="(.*?)" target="_blank".*?>(.*?)</a>.*?</div>.*?<div class="price g_price g_price-highlight">.*?<strong>(.*?)</strong>.*?<div class="deal-cnt">(.*?)</div>', re.S) result = re.findall(pattern, response.text) return result else: return None def save_to_excel(result): book = xlwt.Workbook(encoding='utf-8', style_compression=0) sheet = book.add_sheet('Python数据分析图书', cell_overwrite_ok=True) row_0 = ['序号', '书名', '价格', '销量', '链接'] for i in range(len(row_0)): sheet.write(0, i, row_0[i]) count = 1 for item in result: sheet.write(count, 0, count) sheet.write(count, 1, item[1]) sheet.write(count, 2, item[2]) sheet.write(count, 3, item[3]) sheet.write(count, 4, item[0]) count += 1 book.save('python_books.xls') if __name__ == '__main__': result = get_books() if result: save_to_excel(result) print('数据保存python_books.xls 文件中!') else: print('爬取失败') ``` 这段代码使用了 requests 库发送 HTTP 请求,并使用正则表达式解析网页内容,最后将结果保存Excel 文件中。需要注意的是,因为淘宝有反爬机制,可能需要加上一些其他的处理方法才能成功爬取
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值