#!/usr/bin/env python
# -*- coding: utf-8 -*-
import base64
import json
import os
import re
import sys
import time
import urllib.request

import requests
from lxml import etree
def getPage(base_url):
    """Fetch *base_url*, strip scripts/styles/tags from the HTML, and append
    the remaining plain text to /home/output/crawler_result.csv.

    :param base_url: URL of the page to crawl.
    :return: True on success, False if the fetch or processing fails.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"}
        req = urllib.request.Request(base_url, headers=headers)
        # Context manager closes the socket deterministically; the timeout
        # keeps a dead host from hanging the crawl forever.
        with urllib.request.urlopen(req, timeout=30) as page:
            content = page.read().decode("utf-8", "ignore").lower()
        # Raw strings avoid invalid-escape-sequence warnings in the patterns.
        re_script = re.compile(r'\<script[\S\s]+?\</script\>', re.I)
        re_style = re.compile(r'\<style[\S\s]+?\</style\>', re.I)
        re_tag = re.compile(r'\<[\S\s]+?\>', re.I)
        content = re_script.sub('', content)  # drop <script>...</script>
        content = re_style.sub('', content)   # drop <style>...</style>
        content = re_tag.sub('', content)     # drop every remaining tag
        writefile("/home/output/crawler_result.csv", content)
        return True
    except Exception:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False
def writefile(filename, content):
    """Append *content* plus a trailing newline to *filename*.

    :param filename: path of the file to append to (created if missing).
    :param content: text to write; a "\n" is added after it.
    :return: True on success, False if the file cannot be written.
    """
    try:
        # "with" guarantees the handle is closed even if the write raises.
        with open(filename, 'a') as fp:
            fp.write(content + "\n")
        return True
    except OSError:
        # Only I/O errors are expected here; anything else should surface.
        return False
# One-shot crawl: fetch the target page; on failure, log a timestamped
# error both to Error.log and to stdout.
now = time.strftime('%Y-%m-%d %X', time.localtime())
try:
    target = 'http://117.73.11.244:9090/'
    getPage(target)
except Exception as err:
    info = '%s\nError: %s' % (now, err)
    writefile('Error.log', info)
    print(info)
time.sleep(1)
浪潮优派?浪潮优派o2o线上教育平台浪潮优派首页课程课程包1+x试点政策新闻公告能力测试论坛登录注册登录开始学习吧用户名/手机号/邮箱请输入密码记住我忘记密码?还没有账号?马上注册新用户注册邮箱注册手机号注册请输入邮箱请输入密码-->验证码同意用户协议与法律声明请输入手机号请输入密码向右拖动滑块,进行验证短信验证码我已经接受《注册协议》请接受注册协议已有账号?请直接登录免费领取200元体验课请输入1-14位中英文格式请输入正确的手机号获取验证码立即领取手机号码(*)客户姓名(*)留言新闻列表更多>浪潮1+x证书第一期线上师资培训开课啦2020-05-25107新闻教育部“1+x”证书:《数据采集职业技能等级证...教育部“1+x”证书:《数据采集职业技能等级证书》网络说明会邀请函2020-04-0181公告浪潮出席新技术革命背景下产教融合对话活动11月3日,由山东省教育厅指导、中国教育创新校企联盟举办的新技术革命背景下产教融...2019-11-0425新闻携手新工科产学研联盟,共同推广大数据...10月22日上午,由信息技术新工科产学研联盟和中国软件行业协会联合开展的信息技术...2019-10-22浪潮优派入选首批全国职业教育教师企业...教育部、发改委、工信部、国资委日前公布首批全国职业教育教师企业实践基地名单,山东...2019-10-18人气老师推荐更多>-->选择日期选择时间确认约课暂无约课,请添加须知:暂不支持取消答疑功能。取消确定购买时长立即购买老师正在赶来的路上,请耐心等候…稍后将直接进入直播间该老师临时有事,换一位老师连线吧!×工信部网站copyright@2019-2029山东浪潮优派科技教育有限公司版权所有鲁icp备15009620号关注微信公众号关注微博公众号联系电话:053185106082工作时间:上午9:00——下午5:00
filebeat.inputs:
- type: log
  # Enable this log input so the access log below is actually harvested.
  enabled: true
  # Files Filebeat should tail; here the Apache httpd access log.
  paths:
    - /app/httpd/logs/access_log
filebeat.config.modules:
  path: ${path.config}/modules.d/*.yml
  reload.enabled: false
setup.template.settings:
  index.number_of_shards: 3
setup.kibana:
output.logstash:
  # Forward events to the local Logstash Beats input (port 5045,
  # matching the pipeline's beats { port => 5045 } block).
  hosts: ["localhost:5045"]
processors:
  - add_host_metadata: ~
  - add_cloud_metadata: ~
# Sample Logstash configuration for creating a simple
# Beats -> Logstash -> Elasticsearch pipeline.
input {
  # Receive events shipped by Filebeat on port 5045.
  beats {
    port => 5045
  }
}
filter {
  # Parse Apache common-log lines into structured fields
  # (clientip, response, etc.) via the stock HTTPD_COMMONLOG pattern.
  grok {
    match => { "message" => "%{HTTPD_COMMONLOG}" }
  }
}
output {
  # Write client IP and HTTP status code, space-separated, to a CSV file.
  csv {
    path => "/home/output/httpd-outfile.csv"
    fields => ["clientip", "response"]
    csv_options => {"col_sep" => " "}
  }
  # Echo each event to stdout for debugging.
  stdout{
    codec => rubydebug
  }
}
##########################################################
import re

import pandas as pd
import requests
from lxml import etree
def downloader(url, save_path, timeout=30):
    """Download *url*, strip all HTML markup and whitespace, and append the
    resulting text to *save_path*.

    :param url: page to fetch.
    :param save_path: file the cleaned text is appended to.
    :param timeout: seconds before the HTTP request is abandoned
        (prevents the crawl from hanging forever on a dead host).
    :return: False on failure; None on success (original contract kept).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        content = response.content.decode('utf-8', 'ignore').lower()
        # Raw strings avoid invalid-escape-sequence warnings.
        re_script = re.compile(r'\<script[\S\s]+?\</script\>', re.I)
        re_style = re.compile(r'\<style[\S\s]+?\</style\>', re.I)
        re_tag = re.compile(r'\<[\S\s]+?\>', re.I)
        content = re_script.sub('', content)  # drop <script>...</script>
        content = re_style.sub('', content)   # drop <style>...</style>
        content = re_tag.sub('', content)     # drop every remaining tag
        # Collapse all whitespace so the output is one continuous line.
        for ws in ("\n", " ", "\t"):
            content = content.replace(ws, "")
        save(content, save_path)
    except Exception:
        print("抓取数据失败")
        return False
def save(data, save_path):
    """Append *data* verbatim to *save_path*.

    :param data: text to append (no newline is added).
    :param save_path: path of the file to append to (created if missing).
    :return: True on success, False if the file cannot be written.
    """
    try:
        # "with" closes the handle even if the write raises.
        with open(save_path, 'a') as fp:
            fp.write(data)
        return True
    except OSError:
        return False
if __name__ == "__main__":
    # Crawl the target page and dump its cleaned text into the CSV file.
    target = 'http://117.73.11.244:9090/'
    destination = '/home/output/crawler_result.csv'
    result = downloader(target, destination)
测试用例代码
于 2024-08-15 20:57:41 首次发布