测试用例代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import time
import urllib, time, os, base64, json
import re, sys
import urllib
from lxml import etree

import requests


def getPage(base_url):
    """Download ``base_url``, strip its markup and append the text to the result CSV.

    The page is requested with a Firefox ``User-Agent``, decoded as UTF-8
    (undecodable bytes are dropped) and lower-cased.  ``<script>`` and
    ``<style>`` elements are removed together with their contents, every
    remaining tag is removed, and the resulting text is handed to
    ``writefile`` to be appended to ``/home/output/crawler_result.csv``.

    Args:
        base_url: URL of the page to download.

    Returns:
        ``None`` when every step ran without raising, ``False`` if any step
        raised (the failure is printed, not re-raised).
    """
    # ``import urllib`` alone does not guarantee the ``request`` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"}
        req = urllib.request.Request(base_url, headers=headers)

        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req) as page:
            content = page.read().decode("utf-8", "ignore").lower()

        # Raw strings: the original '\<' / '\S' literals were invalid string
        # escapes.  '<' and '>' need no escaping in a regular expression.
        re_script = re.compile(r'<script[\S\s]+?</script>', re.I)
        re_style = re.compile(r'<style[\S\s]+?</style>', re.I)
        re_tag = re.compile(r'<[\S\s]+?>', re.I)

        # Order matters: drop script/style elements with their bodies first,
        # then remove whatever tags are left.
        content = re_script.sub('', content)
        content = re_style.sub('', content)
        content = re_tag.sub('', content)

        # The parse result is not used afterwards; the calls are kept because
        # an exception raised here skips the write and reports a failure.
        selector = etree.HTML(content.encode("utf-8", 'ignore'))
        selector.xpath("/html/body")

        writefile("/home/output/crawler_result.csv", content)

    except Exception:
        print("Failed to read from %s." % base_url)
        print(sys.exc_info())
        return False


def writefile(filename, content):
    """Append ``content`` followed by a newline to ``filename``.

    Args:
        filename: Path of the file to append to; it is created if missing.
        content: Text to write.

    Returns:
        ``None`` on success, ``False`` if the file could not be opened or
        written, or if ``content`` is not a string.
    """
    try:
        # ``with`` closes the handle even when write() raises; an explicit
        # UTF-8 encoding keeps non-ASCII text writable under any locale.
        with open(filename, 'a', encoding='utf-8') as fp:
            fp.write(content + "\n")
    except (OSError, TypeError, ValueError):
        # Best-effort write: report failure to the caller instead of raising.
        return False


# Local wall-clock timestamp; used as the first line of an error-log entry.
now = time.strftime('%Y-%m-%d %X')

try:
    # Page to crawl.
    url = 'http://117.73.11.244:9090/'
    getPage(url)
except Exception as err:
    # Append the failure to Error.log, echo it, then pause for one second.
    info = '%s\nError: %s' % (now, err)
    writefile('Error.log', info)
    print(info)
    time.sleep(1)
    
    
    
    
    
浪潮优派?浪潮优派o2o线上教育平台浪潮优派首页课程课程包1+x试点政策新闻公告能力测试论坛登录注册登录开始学习吧用户名/手机号/邮箱请输入密码记住我忘记密码?还没有账号?马上注册新用户注册邮箱注册手机号注册请输入邮箱请输入密码-->验证码同意用户协议与法律声明请输入手机号请输入密码向右拖动滑块,进行验证短信验证码我已经接受《注册协议》请接受注册协议已有账号?请直接登录免费领取200元体验课请输入1-14位中英文格式请输入正确的手机号获取验证码立即领取手机号码(*)客户姓名(*)留言新闻列表更多&gt;浪潮1+x证书第一期线上师资培训开课啦2020-05-25107新闻教育部“1+x”证书:《数据采集职业技能等级证...教育部“1+x”证书:《数据采集职业技能等级证书》网络说明会邀请函2020-04-0181公告浪潮出席新技术革命背景下产教融合对话活动11月3日,由山东省教育厅指导、中国教育创新校企联盟举办的新技术革命背景下产教融...2019-11-0425新闻携手新工科产学研联盟,共同推广大数据...10月22日上午,由信息技术新工科产学研联盟和中国软件行业协会联合开展的信息技术...2019-10-22浪潮优派入选首批全国职业教育教师企业...教育部、发改委、工信部、国资委日前公布首批全国职业教育教师企业实践基地名单,山东...2019-10-18人气老师推荐更多&gt;-->选择日期选择时间确认约课暂无约课,请添加须知:暂不支持取消答疑功能。取消确定购买时长立即购买老师正在赶来的路上,请耐心等候…稍后将直接进入直播间该老师临时有事,换一位老师连线吧!×工信部网站copyright@2019-2029山东浪潮优派科技教育有限公司版权所有鲁icp备15009620号关注微信公众号关注微博公众号联系电话:053185106082工作时间:上午9:00——下午5:00



# Filebeat configuration: read one log file and forward its lines to Logstash.
filebeat.inputs:

- type: log
  # Activate this input.
  enabled: true
  # File to read (presumably the Apache httpd access log, judging by the path).
  paths:
    - /app/httpd/logs/access_log

# Where Filebeat looks for per-module configuration files.
filebeat.config.modules: 
  path: ${path.config}/modules.d/*.yml
 
  # Module configuration files are not reloaded while Filebeat is running.
  reload.enabled: false 

# Index template setting: number of primary shards.
setup.template.settings:
  index.number_of_shards: 3

# No Kibana settings are given; the key is left empty.
setup.kibana:

output.logstash:
  # Logstash endpoint(s) that receive the events.
  hosts: ["localhost:5045"]

# Processors applied to every event; `~` (YAML null) passes no options.
processors:
  - add_host_metadata: ~
  - add_cloud_metadata: ~
  
  


# Logstash pipeline: Beats -> grok -> CSV file + stdout.
# (No Elasticsearch output is configured in this pipeline.)

input {
    # Accept events from Beats shippers on TCP port 5045.
    beats {
	    port => 5045
	  }
}
filter { 
    # Parse the raw "message" field with the HTTPD_COMMONLOG grok pattern.
	  grok {
		    match => { "message" => "%{HTTPD_COMMONLOG}" }
		}
}
output {
		# Write the "clientip" and "response" fields (presumably produced by
		# the grok filter above) to a file, separated by the col_sep string.
		csv {
			path => "/home/output/httpd-outfile.csv"
			fields => ["clientip", "response"]
			csv_options => {"col_sep" => "    "}
		}
		
    # Also print every event to the console in rubydebug format.
    stdout{
        codec => rubydebug
    }
    
}  



##########################################################



import re

import pandas as pd
import requests
from lxml import etree


def downloader(url, save_path, timeout=10):
    """Download ``url``, reduce it to bare text and append that text to ``save_path``.

    The response body is decoded as UTF-8 (undecodable bytes are dropped) and
    lower-cased.  ``<script>`` and ``<style>`` elements are removed together
    with their contents, every remaining tag is removed, and all newlines,
    spaces and tabs are deleted before the text is passed to ``save``.

    Args:
        url: Address of the page to download.
        save_path: File the extracted text is appended to.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        ``None`` when every step ran without raising, ``False`` if any step
        raised (the failure is printed, not re-raised).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        content = response.content.decode('utf-8', 'ignore').lower()

        # Raw strings: the original '\<' / '\S' literals were invalid string
        # escapes.  '<' and '>' need no escaping in a regular expression.
        re_script = re.compile(r'<script[\S\s]+?</script>', re.I)
        re_style = re.compile(r'<style[\S\s]+?</style>', re.I)
        re_tag = re.compile(r'<[\S\s]+?>', re.I)

        # Order matters: drop script/style elements with their bodies first,
        # then remove whatever tags are left.
        content = re_script.sub('', content)
        content = re_style.sub('', content)
        content = re_tag.sub('', content)

        # Delete every newline, space and tab in a single pass.
        content = content.translate(str.maketrans('', '', '\n \t'))

        # The parse result is not used afterwards; the calls are kept because
        # an exception raised here skips the save and reports a failure.
        tree = etree.HTML(content.encode('utf-8', 'ignore'))
        tree.xpath('//html/body')

        save(content, save_path)
    except Exception as e:
        print("抓取数据失败")
        # Show the cause as well, so a failure can be diagnosed.
        print(repr(e))
        return False


def save(data, save_path):
    """Append ``data`` to the file at ``save_path`` (no newline is added).

    Args:
        data: Text to write.
        save_path: Path of the file to append to; it is created if missing.

    Returns:
        ``None`` on success, ``False`` if the file could not be opened or
        written, or if ``data`` is not a string.
    """
    try:
        # ``with`` closes the handle even when write() raises; an explicit
        # UTF-8 encoding keeps non-ASCII text writable under any locale.
        with open(save_path, 'a', encoding='utf-8') as fp:
            fp.write(data)
    except (OSError, TypeError, ValueError):
        # Best-effort write: report failure to the caller instead of raising.
        return False


if __name__ == "__main__":
    # Crawl the target page and append its extracted text to the result file.
    url, save_path = 'http://117.73.11.244:9090/', '/home/output/crawler_result.csv'
    result = downloader(url=url, save_path=save_path)





评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值