1、使用无头浏览器在后台动态渲染页面并抓取信息(关键配置如下):
# Launch a headless Chrome instance so pages that render their content via
# JavaScript can still be scraped (no visible browser window is opened).
option=webdriver.ChromeOptions()
option.add_argument('headless') # run Chrome without a GUI ('--headless' is the conventional spelling)
# NOTE(review): the executable_path keyword was removed in Selenium 4 (use
# Service instead) — confirm which Selenium version is installed.
browser = webdriver.Chrome(executable_path='../data/chromedriver/chromedriver.exe',options=option)
2、爬虫完整代码如下:
import configparser
import os
import re
import sys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from utils.write_es_utils import batch_data
# Load the scraper configuration and register the project root on sys.path.
config = configparser.ConfigParser()
config.read('../conf/config.txt', encoding="utf-8")

# The project root is two directory levels above this script.
project_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_path)

# Target Elasticsearch index for scraped news paragraphs.
index_name = config.get("elasticsearch","NEWS")

# Seed URLs for the crawl. The original built this with two insert(0, ...)
# calls, which leaves the list in exactly this order.
new_url_list = ["https://news.163.com/", "http://www.people.com.cn/"]

# URLs already visited (or queued for a visit), to avoid re-crawling.
has_get_url_set = set()
def get_all_message_url(url):
    """Collect candidate article links from *url*.

    Loads the page in the shared headless ``browser``, waits up to 10 s for
    anchor tags to appear, and returns the set of href values that end in
    '.html' (other links are treated as non-article pages).

    Returns an empty set (or whatever was collected so far) when the page
    fails to load or link extraction fails.
    """
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    inner_set = set()
    try:
        wait.until(
            lambda driver: driver.find_elements(By.XPATH, '//a[@href]'))
        outer_current_content = browser.find_elements(By.XPATH,
                                                      '//a[@href]')
        for each in outer_current_content:
            # get_attribute() may return None (e.g. for a stale element);
            # the original crashed on .strip() here, and the surrounding
            # except then silently dropped every remaining link on the page.
            href = each.get_attribute('href')
            if not href:
                continue
            inner_url = href.strip()
            if inner_url.endswith('.html'):
                inner_set.add(inner_url)
    except Exception:
        # Best-effort crawl: a broken page yields a partial result, but
        # leave a trace instead of swallowing the failure silently.
        print('failed to extract links from', url)
    return inner_set
def get_all_message(url):
    """Scrape article paragraphs from *url*.

    Loads the page in the shared headless ``browser``, waits up to 10 s for
    <p> elements, and returns a list of ``{"text": ..., "type": "scrap"}``
    dicts — one per paragraph longer than 15 characters that does not match
    any boilerplate marker (copyright / contact footer text).

    Returns an empty (or partial) list when the page fails to load.
    """
    content = list()
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    # Footer/boilerplate markers; paragraphs containing any of them are skipped.
    boilerplate = re.compile(r'Copyright|人 民 网 版 权|人民日报社概况|信息服务许可证|服务邮箱')
    try:
        wait.until(
            lambda driver: driver.find_elements(By.XPATH, '//p'))
        current_content = browser.find_elements(By.XPATH, '//p')
        for each in current_content:
            # .text is a round-trip to the browser (and can change between
            # reads) — the original read it up to three times per element.
            text = each.text
            if not text:
                continue
            if boilerplate.search(text):
                continue
            stripped = text.strip()
            if len(stripped) > 15:
                content.append({"text": stripped, "type": "scrap"})
    except Exception:
        # Best-effort: report the failing page and return what we have.
        print(url)
    return content
# Launch one shared headless Chrome instance used by both helpers above.
option = webdriver.ChromeOptions()
option.add_argument('headless')  # run Chrome without a GUI
# NOTE(review): executable_path is removed in Selenium 4 — confirm version.
browser = webdriver.Chrome(executable_path='../data/chromedriver/chromedriver.exe', options=option)
# The original also opened '../data/news.txt' here but never wrote to it
# (and never closed it) — the unused handle has been dropped.
try:
    # Queue discipline: new links go to the front, URLs are popped from the
    # back, so pages are visited roughly in discovery (FIFO) order.
    while len(new_url_list) > 0:
        url = new_url_list.pop(-1)
        if url in has_get_url_set:
            continue
        has_get_url_set.add(url)
        try:
            # Scrape paragraphs from the page and index any findings.
            content_list = get_all_message(url)
            if len(content_list) > 0:
                batch_data(content_list, index_name)
            # Enqueue newly-discovered article links.
            content_url = get_all_message_url(url)
            for each in content_url:
                if each not in has_get_url_set:
                    new_url_list.insert(0, each)
        except Exception as e:
            # Best-effort crawl: one bad page must not stop the run, but the
            # original's bare `pass` hid every failure — at least report it.
            print('crawl failed for', url, '-', e)
finally:
    browser.quit()  # always release the Chrome process
3、相关 Elasticsearch 写入工具类(utils/write_es_utils.py):
import configparser
import time
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Read the Elasticsearch connection settings from the shared project config
# and create a single module-level client reused by every helper below.
config = configparser.ConfigParser()
config.read('../conf/config.txt', encoding="utf-8")
# "ES_MESSAGEW" is the host value from the [elasticsearch] section
# (presumably "host:port" for a single node — verify against the config file).
es = Elasticsearch(hosts=[config.get("elasticsearch","ES_MESSAGEW")])
def timer(func):
    """Decorator: print roughly how long *func* took, then return its result."""
    from functools import wraps

    @wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        print('共耗时约 {:.2f} 秒'.format(time.time() - start))
        return res
    return wrapper
@timer
def batch_data(data_list, index_name, _id_name=None):
    """ 批量写入数据 — bulk-index *data_list* into *index_name*.

    Each element of *data_list* becomes one document. When *_id_name* is
    given, the value stored under that key in each document is used as the
    ES ``_id`` (re-indexing the same id overwrites); otherwise ES assigns
    ids automatically.
    """
    # Build the bulk actions once instead of duplicating the whole
    # comprehension for the with-id / without-id cases.
    action = []
    for doc in data_list:
        item = {
            "_index": index_name,
            '_type': '_doc',  # kept as in the original; _type is deprecated in ES 7+
            "_source": doc,
        }
        if _id_name:
            item["_id"] = doc[_id_name]
        action.append(item)
    helpers.bulk(es, action)
def create_index(index_name, mapping_settings):
    """(Re)create *index_name* with *mapping_settings*.

    Any existing index with the same name is deleted first, so this is
    destructive. *mapping_settings* is the body passed to indices.create.
    """
    if es.indices.exists(index_name):
        es.indices.delete(index_name)
    # Original bound the response to an unused `res` local — dropped.
    es.indices.create(index=index_name, body=mapping_settings)
def create_alias(index, alias):
    """Point *alias* at *index*."""
    es.indices.put_alias(index=index, name=alias)
def delete_index(index_name):
    """Remove *index_name* if it exists; silently do nothing otherwise."""
    if not es.indices.exists(index_name):
        return
    es.indices.delete(index_name)
def delete_alias(index, alias):
    """Remove *alias* from *index*."""
    es.indices.delete_alias(index=index, name=alias)
def get_alias_index_name(alias):
    """Return the index mapping behind *alias*, or None when it doesn't exist."""
    if not es.indices.exists_alias(alias):
        return None
    return es.indices.get_alias(name=alias)
def refresh_index(index_name):
    """Force a refresh of *index_name* so recent writes become searchable."""
    es.indices.refresh(index=index_name)
def clean_data(index_name):
    """Delete every document in *index_name* (the index itself is kept)."""
    match_all = {"query": {"match_all": {}}}
    es.delete_by_query(index_name, match_all)