版本一:
使用正则:
我本人正则用得比较烂,你完全可以换成自己的匹配方式。
# Version one: look a word up on dict.youdao.com and pull the translation
# out of the returned HTML with regular expressions (Python 2 script).
import re
import urllib

# Search URL; the URL-encoded query word is appended after "q=".
aa = "http://dict.youdao.com/search?tab=chn&keyfrom=dict.top&q="

# Both patterns are loop-invariant, so they are compiled once up front.
# Captures the (non-greedy) content of the translation container up to the
# first closing </div>.
_TRANS_RE = re.compile(
    r'<div id="etcTrans" class="trans-container tab-content">(.*?)</div>')
# Matches every single character that is NOT a lowercase ASCII letter,
# '<', '>', '.' or '/'; joining the matches drops tag names and markup.
_KEEP_RE = re.compile(r'[^a-z<>\.//]')

print ("input q! to exit ")
while 1:
    word = raw_input(">>>")
    if word == "q!":
        exit()

    # quote_plus turns spaces into '+' (as the original replace did) and also
    # escapes characters such as '&' or '#' that would otherwise corrupt the
    # query string.
    url = aa + urllib.quote_plus(word)

    resp = urllib.urlopen(url)
    try:
        s = resp.read()
    finally:
        # Always release the connection, even if the read fails.
        resp.close()

    # Flatten the page to one line so that '.' in the pattern can span what
    # used to be line breaks.
    ss = s.replace("\n", " ")
    con = "".join(_TRANS_RE.findall(ss))
    dd = con.replace(" ", "")
    print("".join(_KEEP_RE.findall(dd)))
版本二
使用HTMLParser
import re
import urllib
from HTMLParser import HTMLParseError, HTMLParser
class MyHTMLParser(HTMLParser):
    """Print the text found inside the ``<div id="etcTrans">`` element.

    ``readingtitle`` is 1 while the parser is inside that element and 0
    otherwise.  Nested ``<div>`` elements are counted so that only the
    ``</div>`` matching the opening ``etcTrans`` tag ends the capture; a
    plain on/off flag would stop at the first inner ``</div>``.
    """

    def __init__(self):
        self.readingtitle = 0
        # Number of <div> elements currently open, counting the etcTrans
        # container itself; only meaningful while readingtitle == 1.
        self._div_depth = 0
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Begin capturing at ``<div id="etcTrans">`` and track nested divs.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        parser for the start tag.
        """
        if tag != 'div':
            return
        if self.readingtitle == 1:
            # A div nested inside the container: remember it so that its
            # closing tag is not mistaken for the end of the container.
            self._div_depth += 1
        elif ('id', 'etcTrans') in attrs:
            self.readingtitle = 1
            self._div_depth = 1

    def handle_data(self, data):
        """Print each text chunk seen while inside the container."""
        if self.readingtitle == 1:
            # Remove "\r\n" before "\n": in the opposite order the "\n" is
            # already gone and a stray "\r" is left in the output.
            print(data.replace("\r\n", "").replace("\n", ""))

    def handle_endtag(self, tag):
        """Stop capturing once the container's own ``</div>`` is reached."""
        if tag != 'div' or self.readingtitle != 1:
            return
        self._div_depth -= 1
        if self._div_depth == 0:
            self.readingtitle = 0
# Version two: look a word up on dict.youdao.com and let MyHTMLParser print
# the text of the translation container (Python 2 script).

# Search URL; the URL-encoded query word is appended after "q=".
aa = "http://dict.youdao.com/search?tab=chn&keyfrom=dict.top&q="

print ("input q! to exit ")
while 1:
    word = raw_input(">>>")
    if word == "q!":
        exit()

    # quote_plus turns spaces into '+' (as the original replace did) and also
    # escapes characters such as '&' or '#' that would otherwise corrupt the
    # query string.
    url = aa + urllib.quote_plus(word)

    resp = urllib.urlopen(url)
    try:
        s = resp.read()
    finally:
        # Always release the connection, even if the read fails.
        resp.close()

    # A fresh parser per lookup so no state leaks between words.
    parser = MyHTMLParser()
    try:
        parser.feed(s)
        parser.close()
    except HTMLParseError as err:
        # NOTE(review): the author reports an exception raised after the
        # translation has already been printed.  Python 2's HTMLParser can
        # raise HTMLParseError on markup it cannot handle, which is presumably
        # what happens further down the page -- confirm against the real
        # traceback.  Reporting it here keeps the prompt loop alive instead of
        # aborting the whole session.
        print("could not parse the rest of the page: %s" % err)
这个版本是完全可以提取出信息的,但是不知道为什么总是在提取完信息后抛出错误,请朋友们运行看看,帮我指出错误所在。