使用 Python 正则表达式提取 text.txt 中的内容。text.txt 的内容以及应用正则表达式的相关代码如下:
<html>
<head>
<title>
爬虫测试 正则式使用举例
</title>
</head>
<body>
<div class="topic"><a href="">正则式的简单应用</a>
<div class="list">
<ul>
<li><a href="http://arvinshaffer.com/1.html">这是第一条</a></li>
<li><a href="http://arvinshaffer.com/2.html">这是第二条</a></li>
<li><a href="http://arvinshaffer.com/3.html">这是第三条</a></li>
</ul>
</div>
</div>
</body>
</html>
# -*- coding: utf-8 -*-
"""Regular-expression walkthrough over the HTML stored in text.txt.

Steps, in order:
  1. print the contents of the <title> element;
  2. print every href value found in the document;
  3. print the anchor text of the links inside the first <ul> block;
  4. print paginated variants of ``old_url`` (pageNum=2 .. total_page).

Raises AttributeError if the document has no <title> element and
IndexError if it has no <ul> block.
"""
import re

old_url = 'http://www.arvinshaffer.com/course/android/?pageNum=2'
total_page = 20

# The context manager closes the file even if read() raises.
with open('text.txt', 'r') as f:
    html = f.read()

# Extract the title. re.S lets '.' match newlines, so the match still works
# when the title text is on a different line from its tags.
title = re.search(r'<title>(.*?)</title>', html, re.S).group(1)
print(title)

# Extract every link target (including empty href values).
links = re.findall(r'href="(.*?)"', html, re.S)
for each in links:
    print(each)

# Extract part of the text: narrow down to the first <ul> block, then pull
# the anchor text out of that smaller fragment.
text_fied = re.findall(r'<ul>(.*?)</ul>', html, re.S)[0]
the_text = re.findall(r'">(.*?)</a>', text_fied, re.S)
for every_text in the_text:
    print(every_text)

# Pagination with re.sub: swap the page number inside the URL.
# NOTE: the 4th positional argument of re.sub is `count`, not `flags`;
# passing re.S there silently capped the replacements at 16. The flag is
# irrelevant for this pattern (no '.'), so it is simply omitted.
for i in range(2, total_page + 1):
    new_link = re.sub(r'pageNum=\d+', 'pageNum=%d' % i, old_url)
    # Single-argument print(...) yields the same "label<space>url" output on
    # both Python 2 and Python 3.
    print("这是新链接 %s" % new_link)