女友遇到奇葩老师,要求做网页必须使用30个以上的标签才算及格,于是就有了以下脚本……
#!/usr/bin/env python
import re
import sys
class HtmlTagCounter:
    """Collect the distinct HTML opening-tag names found in scanned text.

    Tags are reported in the order they were first seen. Names are compared
    case-insensitively (so ``<DIV>`` and ``<div>`` are one tag), but the
    spelling of the first occurrence is the one that is kept.
    """

    # "<", an optional "!" (for <!DOCTYPE>), one letter, then up to 15 more
    # letters/digits.  The lookahead requires the name to end at whitespace,
    # "/", ">" or end of line, so "<br/>", "<hr />", tab-separated attributes
    # and tags whose attributes continue on the next line are all recognised.
    # Closing tags ("</p>"), comments ("<!--") and text such as "a <3 b" do
    # not match because the name must start with a letter.
    pattern_str = r'<(!?[a-zA-Z][a-zA-Z0-9]{0,15})(?=[\s/>]|$)'
    pattern = re.compile(pattern_str)

    def __init__(self):
        # Lower-cased names already recorded; used only for de-duplication.
        self.__tags_set = set()
        # Tag names in first-seen order, with their original spelling.
        self.__tags = []

    def handle_tag(self, tag):
        """Record ``tag`` unless the same name (ignoring case) was seen before."""
        key = tag.lower()
        if key not in self.__tags_set:
            self.__tags_set.add(key)
            self.__tags.append(tag)

    def scan_line(self, line):
        """Record every opening-tag name that occurs in the string ``line``."""
        for name in self.pattern.findall(line):
            self.handle_tag(name)

    def get_tags(self):
        """Return the list of distinct tag names in first-seen order."""
        return self.__tags
# Driver: feed every line of standard input to the counter, then print the
# distinct tags found and how many there are.
counter = HtmlTagCounter()
# Iterating sys.stdin stops cleanly at EOF on both Python 2 and Python 3,
# replacing the Python 2-only raw_input()/EOFError loop.
for line in sys.stdin:
    counter.scan_line(line)

tags = counter.get_tags()
# Fall back to a placeholder when no tag was found (the join yields '').
tags_found = ', '.join('<%s>' % x for x in tags) or '(nothing)'
# Single-argument print(...) produces the same output on Python 2 and 3.
print('Found: %s' % tags_found)
print('In total: %d' % len(tags))
XXX@DESKTOP-XXXXXXX MINGW64 ~/workspace/htmltagcounter
$ cat test.html | python htmltagcounter.py
Found: <!DOCTYPE>, <html>, <head>, <meta>, <title>, <basefont>, <body>, <header>, <div>, <em>, <u>, <ul>, <li>, <a>, <hr>, <content>, <img>, <center>, <h2>, <b>, <table>, <tbody>, <tr>, <td>, <abbr>, <h1>, <p>, <i>, <br>, <del>, <footer>, <time>, <address>
In total: 33
XXX@DESKTOP-XXXXXXX MINGW64 ~/workspace/htmltagcounter
$ curl www.csdn.net | python htmltagcounter.py
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 98778 100 98778 0 0 267k 0 --:--:-- --:--:-- --:--:-- 280k
Found: <!DOCTYPE>, <html>, <head>, <script>, <meta>, <title>, <link>, <body>, <ins>, <div>, <cite>, <span>, <a>, <input>, <em>, <ul>, <li>, <h4>, <dl>, <dt>, <img>, <dd>, <h2>, <h3>
In total: 24