今天有一个需求:先从一批 HTML 文件中抽取出所有的 URL,然后再从中筛选出符合要求的 URL。
这里先简单写一下第一步
第一步:从html抽取url的做法如下
#!/usr/bin/env python
#encoding:utf-8
'''
功能:抽取html中的链接
'''
import re
def find_all(sourcefile='baidu.txt'):
    '''
    Extract every http/https URL found in an HTML file.

    The file is read in full and scanned with a regular expression. The
    list of matches is printed first, then each URL on its own line.

    Args:
        sourcefile: Path of the HTML file to scan.

    Returns:
        List of matched URL strings in document order (duplicates kept).
    '''
    with open(sourcefile) as sf:
        html = sf.read()
    # A URL runs from its scheme up to the first character that delimits it
    # in HTML/CSS/JS: whitespace, a quote, an angle bracket or a parenthesis.
    # The earlier pattern "<a.*?href=https://|http://.*? " was split by "|"
    # into two unrelated alternatives, never captured https URLs, and ran
    # every match on to the next space, dragging in trailing '"', ')' and
    # markup (and missing URLs that end a line).
    url_pattern = re.compile(r"""https?://[^\s"'<>()]+""")
    url_list = url_pattern.findall(html)
    print(url_list)
    for one in url_list:
        print(one)
    return url_list
if __name__ == "__main__":
    # Run as a script: scan the sample page saved next to this file.
    find_all("baidu.txt")
结果如下(以下输出由正则 `http://.*? ` 匹配得到:每条匹配会一直延伸到下一个空格,所以结尾带有 `)`、`"` 甚至整段标签或脚本等多余字符):
http://s1.bdstatic.com/r/www/cache/static/home/img/qrcode/zbios_efde696.png)
http://s1.bdstatic.com/r/www/cache/static/home/img/qrcode/nuomi_365eabd.png)
http://s1.bdstatic.com/r/www/cache/static/home/img/qrcode/zbios_x2_9d645d9.png);background-size:60px
http://s1.bdstatic.com/r/www/cache/static/home/img/qrcode/nuomi_x2_55dc5b7.png);background-size:60px
http://s1.bdstatic.com/r/www/cache/static/global/img/icons_5859e57.png)
http://s1.bdstatic.com/r/www/cache/static/global/img/icons_5859e57.png)
http://s1.bdstatic.com/r/www/cache/static/home/img/sugbg_1762fe7.png)
http://s1.bdstatic.com/r/www/cache/static/global/img/pc_direct_42d6311.png)
http://s1.bdstatic.com/r/www/cache/static/global/img/pc_direct_42d6311.png)
http://s1.bdstatic.com/r/www/cache/static/home/img/icons_0c37e9b.png)
http://s1.bdstatic.com/r/www/cache/static/home/img/logos/bdbri_icons_0a62ce1.png)
http://s1.bdstatic.com/r/www/cache/static/home/img/icons_0c37e9b.png)
http://s1.bdstatic.com/r/www/cache/static/global/img/quickdelete_33e3eb8.png)
http://news.baidu.com"
http://www.hao123.com"
http://map.baidu.com"
http://v.baidu.com"
http://tieba.baidu.com"
http://xueshu.baidu.com"
http://www.baidu.com/gaoji/preferences.html"
http://www.baidu.com/more/"
http://news.baidu.com/ns?cl=2&rn=20&tn=news&word="
http://tieba.baidu.com/f?kw=&fr=wwwt"
http://zhidao.baidu.com/q?ct=17&pn=0&tn=ikaslist&rn=10&word=&fr=wwwt"
http://music.baidu.com/search?fr=ps&ie=utf-8&key="
http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word="
http://v.baidu.com/v?ct=301989888&rn=20&pn=0&db=0&s=25&ie=utf-8&word="
http://map.baidu.com/m?word=&fr=ps01000"
http://wenku.baidu.com/search?word=&lm=0&od=0&ie=utf-8"
http://home.baidu.com">关于百度</a><a
http://ir.baidu.com">About Baidu</a><a
http://e.baidu.com/?refer=888">百度推广</a></p><p
http://www.baidu.com/duty/"
http://jianyi.baidu.com/"
http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11000002000001">京公网安备11000002000001号</a> <i
http://s1.bdstatic.com/r/www/cache/static/plugins/every_cookie_a70bc15.js";("Mac68K"==navigator.platform||"MacPPC"==navigator.platform||"Macintosh"==navigator.platform||"MacIntel"==navigator.platform)&&(e="http://s1.bdstatic.com/r/www/cache/static/plugins/every_cookie_mac_82990d4.js"),setTimeout(function(){$.ajax({url:e,cache:!0,dataType:"script"})},0);var
http://s1.bdstatic.com/r/www/cache/static/global/js/all_async_search_0596852.js",n="/script";document.write("<script
[Finished in 0.8s]