新闻文本抓取

1、后台动态渲染抓取页面信息:

# Configure a headless Chrome instance so JS-rendered pages can be scraped.
option=webdriver.ChromeOptions()
option.add_argument('headless') # run Chrome without opening a visible window
# NOTE(review): executable_path is deprecated in Selenium 4 (use a Service
# object instead) — confirm which selenium version this project pins.
browser = webdriver.Chrome(executable_path='../data/chromedriver/chromedriver.exe',options=option)

2、整体代码块:

import configparser
import os
import re
import sys

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

from utils.write_es_utils import batch_data

# Runtime configuration: the target ES index name comes from conf/config.txt.
config = configparser.ConfigParser()
config.read('../conf/config.txt', encoding="utf-8")
# Make the project root importable for the local `utils` package.
# NOTE(review): this append runs AFTER `from utils.write_es_utils import ...`
# above has already executed — confirm the script is launched from a working
# directory where that import resolves on its own.
project_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_path)
index_name = config.get("elasticsearch", "NEWS")
# Crawl frontier (FIFO: links enqueued via insert(0), dequeued via pop(-1))
# plus the set of URLs already visited.
new_url_list = ["https://news.163.com/", "http://www.people.com.cn/"]
has_get_url_set = set()
def get_all_message_url(url):
    """Collect candidate article links from *url*.

    Loads the page in the shared headless ``browser``, waits up to 10s for
    anchor tags to appear, and returns the set of href values that end in
    '.html'. On timeout or driver error the failure is reported and an
    empty set is returned, so one bad page cannot stop the crawl.
    """
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    inner_set = set()
    try:
        wait.until(
            lambda driver: driver.find_elements(By.XPATH, '//a[@href]'))
        for anchor in browser.find_elements(By.XPATH, '//a[@href]'):
            href = anchor.get_attribute('href')
            # fix: get_attribute may return None — guard before .strip()
            if not href:
                continue
            href = href.strip()
            if href.endswith('.html'):
                inner_set.add(href)
    except Exception as e:
        # Best-effort crawl: surface the failure instead of hiding it
        # (previously a bare silent pass).
        print('failed to collect links from {}: {}'.format(url, e))
    return inner_set

def get_all_message(url):
    """Scrape paragraph texts from *url*.

    Waits up to 10s for <p> elements, then keeps every paragraph whose
    stripped text is longer than 15 characters and is not site boilerplate
    (copyright footers, service notices, ...). Each kept paragraph becomes
    ``{"text": ..., "type": "scrap"}`` ready for bulk indexing. Returns an
    empty list on timeout/driver errors.
    """
    content = []
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    try:
        wait.until(
            lambda driver: driver.find_elements(By.XPATH, '//p'))
        for paragraph in browser.find_elements(By.XPATH, '//p'):
            # .text is a WebDriver round-trip — read it once, not three times.
            text = paragraph.text
            if not text:
                continue
            if re.search(r'Copyright|人 民 网 版 权|人民日报社概况|信息服务许可证|服务邮箱', text):
                continue  # skip site boilerplate
            text = text.strip()
            if len(text) > 15:
                content.append({"text": text, "type": "scrap"})
    except Exception as e:
        # Keep the crawl alive, but report what went wrong instead of
        # printing only the URL.
        print('scrape failed for {}: {}'.format(url, e))
    return content

# Configure the shared headless Chrome instance used by both scrape helpers.
option=webdriver.ChromeOptions()
option.add_argument('headless') # run Chrome without opening a visible window
# NOTE(review): executable_path is deprecated in Selenium 4 (use a Service
# object instead) — confirm which selenium version this project pins.
browser = webdriver.Chrome(executable_path='../data/chromedriver/chromedriver.exe',options=option)

# NOTE(review): input_file is opened but never written to anywhere in this
# script — confirm whether scraped text should also be appended to news.txt.
input_file = open('../data/news.txt', 'a+', encoding='utf-8')
try:
    # Breadth-first crawl: pop the oldest URL, index its paragraphs into ES,
    # then enqueue every unseen '.html' link it points at.
    while new_url_list:
        url = new_url_list.pop(-1)
        if url in has_get_url_set:
            continue  # already crawled
        has_get_url_set.add(url)
        try:
            content_list = get_all_message(url)
            if content_list:
                batch_data(content_list, index_name)
            for link in get_all_message_url(url):
                if link not in has_get_url_set:
                    new_url_list.insert(0, link)
        except Exception as e:
            # Best-effort: report and move on instead of silently swallowing
            # (previously a bare pass that hid every failure).
            print('error while crawling {}: {}'.format(url, e))
finally:
    input_file.close()  # fix: the handle previously leaked

3、相关工具类(Elasticsearch 写入工具 utils/write_es_utils.py):

import configparser
import functools
import time

from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Shared Elasticsearch client for every helper below; the host address is
# read from conf/config.txt.
config = configparser.ConfigParser()
config.read('../conf/config.txt', encoding="utf-8")
# NOTE(review): option key "ES_MESSAGEW" looks like a possible typo of
# "ES_MESSAGE" — confirm against the actual config.txt contents.
es = Elasticsearch(hosts=[config.get("elasticsearch","ES_MESSAGEW")])


def timer(func):
    """Decorator that prints the wall-clock runtime of each call to *func*.

    The wrapped function's return value is passed through unchanged.
    """
    @functools.wraps(func)  # fix: preserve the wrapped function's metadata
    def wrapper(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        print('共耗时约 {:.2f} 秒'.format(time.time() - start))
        return res

    return wrapper


@timer
def batch_data(data_list, index_name, _id_name=None):
    """批量写入数据 — bulk-index *data_list* into *index_name*.

    Each element of *data_list* becomes one document. When *_id_name* is
    given, that key of each document supplies the explicit ES ``_id``;
    otherwise Elasticsearch auto-generates ids.
    """
    # Single loop instead of two near-identical list comprehensions.
    action = []
    for doc in data_list:
        item = {
            "_index": index_name,
            '_type': '_doc',
            "_source": doc,
        }
        if _id_name:
            item["_id"] = doc[_id_name]
        action.append(item)
    helpers.bulk(es, action)


def create_index(index_name, mapping_settings):
    """Drop *index_name* if it already exists, then recreate it with the
    given mappings/settings body. Destructive: existing data is lost."""
    if es.indices.exists(index_name):
        es.indices.delete(index_name)
    # fix: result was bound to an unused local — call for its side effect
    es.indices.create(index=index_name, body=mapping_settings)
def create_alias(index,alias):
    """Point alias *alias* at index *index* (positional args map to the
    client's ``index`` and ``name`` parameters)."""
    es.indices.put_alias(index,alias)

def delete_index(index_name):
    """Delete *index_name* if present; do nothing when it does not exist."""
    if not es.indices.exists(index_name):
        return
    es.indices.delete(index_name)

def delete_alias(index,alias):
    """Remove alias *alias* from index *index*."""
    es.indices.delete_alias(index,alias)
def get_alias_index_name(alias):
    """Return the index mapping behind *alias*, or None when it is absent."""
    if not es.indices.exists_alias(alias):
        return None
    return es.indices.get_alias(name=alias)
def refresh_index(index_name):
    """Force a refresh of *index_name* so recent writes become searchable."""
    es.indices.refresh(index_name)


def clean_data(index_name):
    """Delete every document in *index_name* via a match_all
    delete-by-query; the index itself (and its mappings) is kept."""
    match_all_query = {"query": {"match_all": {}}}
    es.delete_by_query(index_name, match_all_query)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

会发paper的学渣

您的鼓励和将是我前进的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值