VS Code+Python 爬取数据并保存到excel(未整理)

import random
import traceback
from bs4 import BeautifulSoup
import requests
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Test: scrape a paginated Element-UI table with Selenium and export it to Excel
def test1():
    """Scrape a paginated Element-UI table with Selenium and save all rows to Excel.

    Opens the target page in Edge, waits for the ``el-table`` component to
    render, then walks every result page (1986 "next" clicks) and extracts
    the 10 cells of each ``el-table__row`` into an xlwt workbook.  A new
    sheet is started every 30 000 rows because the legacy .xls format caps
    a sheet at 65 536 rows.

    NOTE(review): the target URL is the placeholder string 'url' — fill in
    the real address before running.
    """
    import xlwt
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    HEADERS = ('列1', '列2', '列3', '列4', '列5', '列6', '列7', '列8', '列9', '列10')

    def _init_sheet(book, title):
        """Add a sheet named *title* to *book*, write the header row, return it."""
        sheet = book.add_sheet(title, cell_overwrite_ok=True)
        for col_idx, header in enumerate(HEADERS):
            sheet.write(0, col_idx, header)
        return sheet

    def _cell_text(row, col_no, empty_placeholder=None):
        """Return the text of 1-based column *col_no* of a <tr> soup node.

        Reproduces the original formatting hack: str() of the cell's
        ``.contents`` with the surrounding list brackets stripped.  When
        *empty_placeholder* is given, an empty cell ("[]") is replaced by it.
        """
        td = row.find('td', {'class': 'el-table_1_column_%d' % col_no})
        divs = td.find_all('div', {'class': 'cell'})
        text = str(divs[0].contents).replace("['", "").replace("']", "")
        if empty_placeholder is not None:
            text = text.replace("[]", empty_placeholder)
        return text

    def _fetch_rows(wait, browser, min_rows):
        """Poll until the table shows more than *min_rows* rows; return the <tr> list."""
        while True:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'el-table')))
            time.sleep(3)  # give the front-end time to re-render rows after the spinner
            # Use the public page_source, not the private wait._driver attribute.
            soup = BeautifulSoup(browser.page_source, features="html.parser")
            rows = soup.find_all('tr', {'class': 'el-table__row'})
            if len(rows) > min_rows:
                return rows

    def _write_rows(sheet, rows, line):
        """Append *rows* to *sheet* starting at row index *line*; return next free line."""
        for row in rows:
            values = [_cell_text(row, c) for c in range(1, 9)]
            # Columns 9 and 10 may be empty; show "无" (none) instead of "[]".
            values += [_cell_text(row, c, empty_placeholder="无") for c in (9, 10)]
            for col_idx, value in enumerate(values):
                sheet.write(line, col_idx, value)
            print(" ".join(values))
            line += 1
        return line

    browser = webdriver.Edge()
    browser.get('url')  # placeholder — replace with the real page URL
    wait = WebDriverWait(browser, 30)

    # First page: block until at least one data row has rendered.
    rows = _fetch_rows(wait, browser, min_rows=0)

    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = _init_sheet(book, '数据1')
    # BUGFIX: the original iterated a hard-coded 100 rows, raising IndexError
    # whenever the first page held fewer; iterate the rows actually found.
    line = _write_rows(sheet, rows, line=1)

    sheet_index = 2
    for page in range(1, 1987):
        # Click the "next page" button of the Element-UI pagination bar.
        next_buttons = browser.find_elements(By.CLASS_NAME, "btn-next")
        next_buttons[0].click()
        # Random polite delay (original comment: don't overload the site; take
        # the data and go — N >= 3 recommended).
        time.sleep(random.randint(1, 3))
        # The stale DOM keeps one leftover row while the new page loads;
        # wait until more than one row is present.
        rows = _fetch_rows(wait, browser, min_rows=1)

        # Roll over to a new sheet well before the .xls 65 536-row hard limit.
        if line > 30000:
            sheet = _init_sheet(book, '数据' + str(sheet_index))
            sheet_index += 1
            line = 1
        line = _write_rows(sheet, rows, line)

    # BUGFIX: xlwt writes the legacy BIFF (.xls) format; saving it under an
    # .xlsx extension produced a file Excel refuses to open cleanly.
    savepath = 'C:/Users/小陈的电脑/Desktop/test.xls'
    book.save(savepath)
# For testing: run the scraper when this script is executed.
test1()

下载(安装)第三方模块使用下面这个。注意:`pip.main` 在 pip 10 之后已被移除,不应在代码中直接调用 pip,推荐通过子进程调用 `python -m pip`:

import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "xlwt"])  # 把 xlwt 换成需要安装的库名称
以下是使用 Python 爬取淘宝 Python 数据分析图书并保存Excel 中的示例代码: ```python import requests import re import xlwt def get_books(): url = 'https://s.taobao.com/search?q=python%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220106&ie=utf8' headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36' } response = requests.get(url, headers=headers) if response.status_code == 200: # 使用正则表达式解析网页内容 pattern = re.compile('<div class="title">.*?<a href="(.*?)" target="_blank".*?>(.*?)</a>.*?</div>.*?<div class="price g_price g_price-highlight">.*?<strong>(.*?)</strong>.*?<div class="deal-cnt">(.*?)</div>', re.S) result = re.findall(pattern, response.text) return result else: return None def save_to_excel(result): book = xlwt.Workbook(encoding='utf-8', style_compression=0) sheet = book.add_sheet('Python数据分析图书', cell_overwrite_ok=True) row_0 = ['序号', '书名', '价格', '销量', '链接'] for i in range(len(row_0)): sheet.write(0, i, row_0[i]) count = 1 for item in result: sheet.write(count, 0, count) sheet.write(count, 1, item[1]) sheet.write(count, 2, item[2]) sheet.write(count, 3, item[3]) sheet.write(count, 4, item[0]) count += 1 book.save('python_books.xls') if __name__ == '__main__': result = get_books() if result: save_to_excel(result) print('数据保存python_books.xls 文件中!') else: print('爬取失败') ``` 这段代码使用了 requests 库发送 HTTP 请求,并使用正则表达式解析网页内容,最后将结果保存Excel 文件中。需要注意的是,因为淘宝有反爬机制,可能需要加上一些其他的处理方法才能成功爬取
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值