python 基础教程 23章NNTP 3

8 篇文章 0 订阅
3 篇文章 0 订阅
#!/usr/bin/env python
# -*- coding: utf-8 -*-


# python 基础教程23章NNTP 23-2 更灵活的新闻收集代理程序
#python2.7 运行


import re
import textwrap
from contextlib import closing
from email import message_from_string
from nntplib import NNTP
from time import strftime, time, localtime
from urllib import urlopen
from xml.sax.saxutils import escape


# Number of seconds in one day (24 hours * 60 minutes * 60 seconds).
day = 24 * 60 * 60


def wrap(string, max=70):
    """
    Re-flow ``string`` so that no line is longer than ``max`` characters.

    string -- the text to wrap
    max    -- maximum line width in characters (default 70)

    Returns the wrapped lines joined with newlines, followed by one
    trailing newline.  An empty input therefore yields just '\\n'.
    """
    # The width has to be handed to textwrap explicitly; otherwise the
    # ``max`` argument is ignored and textwrap's own default is used.
    return '\n'.join(textwrap.wrap(string, width=max)) + '\n'


class NewsAgent():
    """
    Object that pulls news items from its registered sources and hands
    them on to its registered destinations.
    """

    def __init__(self):
        self.sources, self.destinations = [], []

    def addSource(self, source):
        """Register an object whose getItems() produces news items."""
        self.sources.append(source)

    def addDestination(self, dest):
        """Register an object whose receiveItems() consumes news items."""
        self.destinations.append(dest)

    def distribute(self):
        """
        Collect the items of every source into one list, then pass that
        same list to every destination in registration order.
        """
        gathered = [
            item
            for source in self.sources
            for item in source.getItems()
        ]
        for target in self.destinations:
            target.receiveItems(gathered)


class NewsItem():
    """
    A minimal news item: a title plus the body text that goes with it.
    """

    def __init__(self, title, body):
        self.title, self.body = title, body


class NNTPSource():
    """
    News source that reads the most recent articles of one NNTP group.

    servername -- host name of the NNTP server to connect to
    group      -- name of the newsgroup to read
    window     -- stored on the instance; getItems() does not consult it
    limit      -- maximum number of most recent articles to yield
                  (default 10; zero or a negative value yields nothing)
    """

    def __init__(self, servername, group, window, limit=10):
        self.servername = servername
        self.group = group
        self.window = window
        self.limit = limit

    def getItems(self):
        """
        Yield one NewsItem per article, taking the last ``limit`` entries
        of the group's subject listing.

        The connection is opened when iteration starts and is closed
        again when the generator finishes, is closed early, or fails.
        """
        server = NNTP(self.servername)
        try:
            first, last = server.group(self.group)[2:4]
            # %-formatting builds the 'first-last' range whether the
            # server object reports the article numbers as text or ints.
            subjects = server.xhdr('subject', '%s-%s' % (first, last))[1]

            # A plain [-0:] slice would select everything, so guard it.
            recent = subjects[-self.limit:] if self.limit > 0 else []

            for number, title in recent:
                lines = server.body(number)[3]
                # Body lines arrive without line terminators; join them
                # with newlines so the text keeps its line structure.
                yield NewsItem(title, '\n'.join(lines))
        finally:
            server.quit()


"""
书中原例getItems()方法
返回 nntplib.NNTPTemporaryError: 480 NEWNEWS command disabled by administrator
#480管理员禁用NEWNEWS命令


def getItems(self):
start = localtime(time() - self.window*day)
date = strftime('%y%m%d', start)
hour = strftime('%H%M%S', start)

server = NNTP(self.servername)
ids = server.newnews(self.group, date, hour)[1]

for id in ids:
lines = server.article(id)[3]
message = message_from_string('\n'.join(lines))

title = message['subject']
body = message.get_payload()
if message.is_multipart():
body = body[0]

yield NewsItem(title, body)
server.quit()
"""





class SimpleWebSource():
    """
    News source that extracts items from a web page with two regular
    expressions, one for titles and one for bodies.

    url          -- address of the page to download
    titlePattern -- regular expression (string) matching item titles
    bodyPattern  -- regular expression (string) matching item bodies
    """

    def __init__(self, url, titlePattern, bodyPattern):
        self.url = url
        self.titlePattern = re.compile(titlePattern)
        self.bodyPattern = re.compile(bodyPattern)

    @staticmethod
    def _captured(match):
        """
        Return the text to use from one ``findall`` result.

        With two or more groups in the pattern ``findall`` produces
        tuples, and the second group is the wanted text.  With at most
        one group it produces plain strings, which are used unchanged.
        """
        if isinstance(match, tuple):
            return match[1]
        return match

    def getItems(self):
        """
        Download the page and yield a NewsItem for each title/body pair.

        Titles and bodies are paired in order of appearance; surplus
        matches of either kind are dropped by zip().
        """
        # closing() releases the connection as soon as the page is read.
        with closing(urlopen(self.url)) as page:
            text = page.read()

        titles = self.titlePattern.findall(text)
        bodies = self.bodyPattern.findall(text)
        for title, body in zip(titles, bodies):
            yield NewsItem(self._captured(title), wrap(self._captured(body)))
"""
书中原例 getItems()方法
def getItems(self):
text = urlopen(self.url).read()
titles = self.titlePattern.findall(text)
bodies = self.bodyPattern.findall(text)
for title, body in zip(titles, bodies):
yield NewsItem(title, wrap(body))
"""


class PlainDestination():
    """
    News destination that prints every item as plain text on standard
    output.
    """

    def receiveItems(self, items):
        """Print each title, a dashed underline of equal length, and the body."""
        for entry in items:
            heading = entry.title
            underline = '-' * len(heading)
            print(heading)
            print(underline)
            print(entry.body)


class HTMLDestination():
    """
    News destination that writes all items to one HTML file: a linked
    table of contents followed by one section per item.

    filename -- path of the HTML file to (over)write
    """

    def __init__(self, filename):
        self.filename = filename

    def receiveItems(self, items):
        """
        Write ``items`` to ``self.filename`` as an HTML page.

        Each item needs ``title`` and ``body`` attributes.  Both are
        HTML-escaped before being written, because they come from
        external servers and may contain '<', '>' or '&'.
        """
        # The items are walked twice (index, then sections); copying them
        # into a list keeps that working for one-shot iterables too.
        items = list(items)

        lines = [
            '',
            '<html>',
            '<head>',
            "<title>Today's News</title>",
            '</head>',
            '<body>',
            "<h1>Today's News</h1>",
            '',
            '<ul>',
        ]
        for number, item in enumerate(items, 1):
            lines.append('<li><a href="#%i">%s</a></li>'
                         % (number, escape(item.title)))
        lines.append('</ul>')

        for number, item in enumerate(items, 1):
            lines.append('<h2><a name="%i">%s</a></h2>'
                         % (number, escape(item.title)))
            lines.append('<pre>%s</pre>' % escape(item.body))

        lines.extend(['', '</body>', '</html>', ''])

        # The with-block guarantees the file is flushed and closed even
        # if writing fails part-way through.
        with open(self.filename, 'w') as out:
            out.write('\n'.join(lines) + '\n')


def runDefaultSetup():
    """
    Build an agent with a default set of sources and destinations and
    run it once.  Adjust the values below as needed.
    """
    agent = NewsAgent()

    # Web source: items pulled from the BBC news page with two regexes.
    agent.addSource(SimpleWebSource(
        'http://www.bbc.com/news',
        r'<h3 class="(.+?)">(.+?)</h3>',
        r'<p class="(.+?)">(.+?)</p>'
    ))

    # NNTP source reading comp.lang.python.
    # Server / newsgroup pairs noted by the author:
    #   'web.aioe.org',   'comp.lang.python'
    #   'news.gmane.org', 'gmane.comp.python.committers'
    agent.addSource(NNTPSource('web.aioe.org', 'comp.lang.python', 1))

    # Destinations: plain text on stdout, then an HTML file.
    for destination in (PlainDestination(), HTMLDestination('news.html')):
        agent.addDestination(destination)

    # Fetch everything and publish it.
    agent.distribute()


# Run the default configuration only when executed as a script.
if __name__ == '__main__':
    runDefaultSetup()



  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
在NSL-KDD数据集中,包含了许多字符串类型的特征,例如“protocol_type”,“service”,“flag”等等。这些字符串特征需要被转换成数字标签,方便后续的机器学习算法使用。下面是一个将NSL-KDD中的字符串特征转换成数字标签的示例代码: ```python import pandas as pd # 读取NSL-KDD数据集 data = pd.read_csv('kddcup.csv', header=None) # 将字符串特征转换成数字标签 service_mapping = {'aol': 1, 'auth': 2, 'bgp': 3, 'courier': 4, 'csnet_ns': 5, 'ctf': 6, 'daytime': 7, 'discard': 8, 'domain': 9, 'domain_u': 10, 'echo': 11, 'eco_i': 12, 'ecr_i': 13, 'efs': 14, 'exec': 15, 'finger': 16, 'ftp': 17, 'ftp_data': 18, 'gopher': 19, 'harvest': 20, 'hostnames': 21, 'http': 22, 'http_2784': 23, 'http_443': 24, 'http_8001': 25, 'imap4': 26, 'IRC': 27, 'iso_tsap': 28, 'klogin': 29, 'kshell': 30, 'ldap': 31, 'link': 32, 'login': 33, 'mtp': 34, 'name': 35, 'netbios_dgm': 36, 'netbios_ns': 37, 'netbios_ssn': 38, 'netstat': 39, 'nnsp': 40, 'nntp': 41, 'ntp_u': 42, 'other': 43, 'pm_dump': 44, 'pop_2': 45, 'pop_3': 46, 'printer': 47, 'private': 48, 'red_i': 49, 'remote_job': 50, 'rje': 51, 'shell': 52, 'smtp': 53, 'sql_net': 54, 'ssh': 55, 'sunrpc': 56, 'supdup': 57, 'systat': 58, 'telnet': 59, 'tftp_u': 60, 'tim_i': 61, 'time': 62, 'urh_i': 63, 'urp_i': 64, 'uucp': 65, 'uucp_path': 66, 'vmnet': 67, 'whois': 68, 'X11': 69, 'Z39_50': 70} data[1] = data[1].map(service_mapping) protocol_mapping = {'tcp': 1, 'udp': 2, 'icmp': 3} data[2] = data[2].map(protocol_mapping) flag_mapping = {'OTH': 1, 'REJ': 2, 'RSTO': 3, 'RSTOS0': 4, 'RSTR': 5, 'S0': 6, 'S1': 7, 'S2': 8, 'S3': 9, 'SF': 10, 'SH': 11} data[3] = data[3].map(flag_mapping) # 输出转换后的数据 print(data.head()) ``` 在上面的代码中,我们使用了字典(mapping)的方式将字符串特征转换成数字标签。例如,将“service”特征中的“ftp”转换成数字标签17。最终输出的数据是一个经过转换的数据集,其中字符串特征已经被转换成了数字标签。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值