#python# Accelerated and non-accelerated crawler implementations

Without acceleration

The non-accelerated crawler fetches the news items in order, one page at a time, but it is relatively slow.

Complete program

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import os
from time import time


def getinfo(html, news):
    """Extract news entries (time, title, link) from the page source."""
    soup = BeautifulSoup(html, 'html.parser')  # parse the HTML text
    titleL = soup.find_all('a', attrs={'target': '_blank'})  # all <a target="_blank"> tags
    timeL = soup.find_all('span', attrs={'class': 'c_time'})  # all <span class="c_time"> tags
    for i in range(len(titleL) - 1):  # drop the last item, which is noise
        try:
            title = titleL[i].get_text()      # news title
            link = titleL[i]['href']          # news URL
            pub_time = timeL[i].get_text()    # publication time (naming it 'time' would shadow time())
            news.append([pub_time, title, link])  # record the entry
        except (IndexError, KeyError):
            continue
def gethtml(url):
    """Fetch the rendered page source with headless Chrome."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run Chrome without a window
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        wait = WebDriverWait(browser, 10)  # wait at most 10 seconds
        wait.until(EC.presence_of_element_located((By.ID, 'd_list')))  # wait until the news list has rendered
        return browser.page_source  # rendered HTML
    except Exception:
        return ''
    finally:
        browser.quit()  # quit (not close), so the driver process exits even on timeout
if __name__ == '__main__':
    start_time = time()
    if not os.path.exists("news_output"):
        os.mkdir("news_output")
    print("Ready, starting to crawl....", end=" ")
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page='
    n = 8
    news = [['time', 'title', 'link']]
    for i in range(n):
        page_url = url + str(i + 1)  # build each page URL afresh; mutating url would make the page numbers accumulate
        html = gethtml(page_url)
        getinfo(html, news)
        print("\rCrawling....{:0.2f}% done".format((i + 1) * 100 / n), end=' ')
    df = pd.DataFrame(news)
    df.to_excel(os.path.join("news_output", "news.xlsx"), header=False, index=False)
    end_time = time()
    print("\rSina news crawl finished! Total time: {:0.2f}s".format(end_time - start_time))

Multiprocessing acceleration

The accelerated version uses a pool of worker processes (multiprocessing.Pool, not threads) to crawl the pages concurrently, writing each page's results to its own file; it is considerably faster.

Complete program

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from multiprocessing import Pool
import pandas as pd


def runfunc(url, i):
    """Crawl one page in a worker process and write it to its own file."""
    news = [['time', 'title', 'link']]
    filename = 'page' + str(i + 1) + 'news.xlsx'
    page_url = url + str(i + 1)  # page numbers start at 1
    html = gethtml(page_url)
    getinfo(html, news)
    df = pd.DataFrame(news)
    df.to_excel(filename, header=False, index=False)
          
def getinfo(html, news):
    """Extract news entries (time, title, link) from the page source."""
    soup = BeautifulSoup(html, 'html.parser')
    titleL = soup.find_all('a', attrs={'target': '_blank'})
    timeL = soup.find_all('span', attrs={'class': 'c_time'})
    for i in range(len(titleL) - 1):  # drop the last item, which is noise
        try:
            title = titleL[i].get_text()
            link = titleL[i]['href']
            pub_time = timeL[i].get_text()
            news.append([pub_time, title, link])
        except (IndexError, KeyError):
            continue
def gethtml(url):
    """Fetch the rendered page source with headless Chrome."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run Chrome without a window
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.ID, 'd_list')))
        return browser.page_source
    finally:
        browser.quit()  # quit here, or every worker leaks a Chrome process
          
          
if __name__ == '__main__':
    pool = Pool(8)  # one worker process per page
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page='
    n = 8
    for i in range(n):
        pool.apply_async(runfunc, args=(url, i))
    pool.close()  # no more tasks will be submitted
    pool.join()   # block until every worker has finished
    print("Sina news crawl finished!")