新闻文本抓取

1、后台动态渲染抓取页面信息:

# Configure a headless Chrome instance so JS-rendered pages can be scraped.
option=webdriver.ChromeOptions()
option.add_argument('headless') # run Chrome without opening a visible window
# NOTE(review): executable_path is deprecated in Selenium 4 (use a Service
# object instead) — confirm which selenium version this project pins.
browser = webdriver.Chrome(executable_path='../data/chromedriver/chromedriver.exe',options=option)

2、整体代码块:

import configparser
import os
import re
import sys

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

from utils.write_es_utils import batch_data

# Runtime configuration: the target ES index name comes from conf/config.txt.
config = configparser.ConfigParser()
config.read('../conf/config.txt', encoding="utf-8")
# Make the project root importable for the local `utils` package.
# NOTE(review): this append runs AFTER `from utils.write_es_utils import ...`
# above has already executed — confirm the script is launched from a working
# directory where that import resolves on its own.
project_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_path)
index_name = config.get("elasticsearch", "NEWS")
# Crawl frontier (FIFO: links enqueued via insert(0), dequeued via pop(-1))
# plus the set of URLs already visited.
new_url_list = ["https://news.163.com/", "http://www.people.com.cn/"]
has_get_url_set = set()
def get_all_message_url(url):
    """Collect candidate article links from *url*.

    Loads the page in the shared headless ``browser``, waits up to 10s for
    anchor tags to appear, and returns the set of href values that end in
    '.html'. On timeout or driver error the failure is reported and an
    empty set is returned, so one bad page cannot stop the crawl.
    """
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    inner_set = set()
    try:
        wait.until(
            lambda driver: driver.find_elements(By.XPATH, '//a[@href]'))
        for anchor in browser.find_elements(By.XPATH, '//a[@href]'):
            href = anchor.get_attribute('href')
            # fix: get_attribute may return None — guard before .strip()
            if not href:
                continue
            href = href.strip()
            if href.endswith('.html'):
                inner_set.add(href)
    except Exception as e:
        # Best-effort crawl: surface the failure instead of hiding it
        # (previously a bare silent pass).
        print('failed to collect links from {}: {}'.format(url, e))
    return inner_set

def get_all_message(url):
    """Scrape paragraph texts from *url*.

    Waits up to 10s for <p> elements, then keeps every paragraph whose
    stripped text is longer than 15 characters and is not site boilerplate
    (copyright footers, service notices, ...). Each kept paragraph becomes
    ``{"text": ..., "type": "scrap"}`` ready for bulk indexing. Returns an
    empty list on timeout/driver errors.
    """
    content = []
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    try:
        wait.until(
            lambda driver: driver.find_elements(By.XPATH, '//p'))
        for paragraph in browser.find_elements(By.XPATH, '//p'):
            # .text is a WebDriver round-trip — read it once, not three times.
            text = paragraph.text
            if not text:
                continue
            if re.search(r'Copyright|人 民 网 版 权|人民日报社概况|信息服务许可证|服务邮箱', text):
                continue  # skip site boilerplate
            text = text.strip()
            if len(text) > 15:
                content.append({"text": text, "type": "scrap"})
    except Exception as e:
        # Keep the crawl alive, but report what went wrong instead of
        # printing only the URL.
        print('scrape failed for {}: {}'.format(url, e))
    return content

# Configure the shared headless Chrome instance used by both scrape helpers.
option=webdriver.ChromeOptions()
option.add_argument('headless') # run Chrome without opening a visible window
# NOTE(review): executable_path is deprecated in Selenium 4 (use a Service
# object instead) — confirm which selenium version this project pins.
browser = webdriver.Chrome(executable_path='../data/chromedriver/chromedriver.exe',options=option)

# NOTE(review): input_file is opened but never written to anywhere in this
# script — confirm whether scraped text should also be appended to news.txt.
input_file = open('../data/news.txt', 'a+', encoding='utf-8')
try:
    # Breadth-first crawl: pop the oldest URL, index its paragraphs into ES,
    # then enqueue every unseen '.html' link it points at.
    while new_url_list:
        url = new_url_list.pop(-1)
        if url in has_get_url_set:
            continue  # already crawled
        has_get_url_set.add(url)
        try:
            content_list = get_all_message(url)
            if content_list:
                batch_data(content_list, index_name)
            for link in get_all_message_url(url):
                if link not in has_get_url_set:
                    new_url_list.insert(0, link)
        except Exception as e:
            # Best-effort: report and move on instead of silently swallowing
            # (previously a bare pass that hid every failure).
            print('error while crawling {}: {}'.format(url, e))
finally:
    input_file.close()  # fix: the handle previously leaked

3、相关工具类(Elasticsearch 写入工具 utils/write_es_utils.py):

import configparser
import functools
import time

from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Shared Elasticsearch client for every helper below; the host address is
# read from conf/config.txt.
config = configparser.ConfigParser()
config.read('../conf/config.txt', encoding="utf-8")
# NOTE(review): option key "ES_MESSAGEW" looks like a possible typo of
# "ES_MESSAGE" — confirm against the actual config.txt contents.
es = Elasticsearch(hosts=[config.get("elasticsearch","ES_MESSAGEW")])


def timer(func):
    """Decorator that prints the wall-clock runtime of each call to *func*.

    The wrapped function's return value is passed through unchanged.
    """
    @functools.wraps(func)  # fix: preserve the wrapped function's metadata
    def wrapper(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        print('共耗时约 {:.2f} 秒'.format(time.time() - start))
        return res

    return wrapper


@timer
def batch_data(data_list, index_name, _id_name=None):
    """批量写入数据 — bulk-index *data_list* into *index_name*.

    Each element of *data_list* becomes one document. When *_id_name* is
    given, that key of each document supplies the explicit ES ``_id``;
    otherwise Elasticsearch auto-generates ids.
    """
    # Single loop instead of two near-identical list comprehensions.
    action = []
    for doc in data_list:
        item = {
            "_index": index_name,
            '_type': '_doc',
            "_source": doc,
        }
        if _id_name:
            item["_id"] = doc[_id_name]
        action.append(item)
    helpers.bulk(es, action)


def create_index(index_name, mapping_settings):
    """Drop *index_name* if it already exists, then recreate it with the
    given mappings/settings body. Destructive: existing data is lost."""
    if es.indices.exists(index_name):
        es.indices.delete(index_name)
    # fix: result was bound to an unused local — call for its side effect
    es.indices.create(index=index_name, body=mapping_settings)
def create_alias(index,alias):
    """Point alias *alias* at index *index* (positional args map to the
    client's ``index`` and ``name`` parameters)."""
    es.indices.put_alias(index,alias)

def delete_index(index_name):
    """Delete *index_name* if present; do nothing when it does not exist."""
    if not es.indices.exists(index_name):
        return
    es.indices.delete(index_name)

def delete_alias(index,alias):
    """Remove alias *alias* from index *index*."""
    es.indices.delete_alias(index,alias)
def get_alias_index_name(alias):
    """Return the index mapping behind *alias*, or None when it is absent."""
    if not es.indices.exists_alias(alias):
        return None
    return es.indices.get_alias(name=alias)
def refresh_index(index_name):
    """Force a refresh of *index_name* so recent writes become searchable."""
    es.indices.refresh(index_name)


def clean_data(index_name):
    """Delete every document in *index_name* via a match_all
    delete-by-query; the index itself (and its mappings) is kept."""
    match_all_query = {"query": {"match_all": {}}}
    es.delete_by_query(index_name, match_all_query)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

会发paper的学渣

您的鼓励和将是我前进的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值