# -*- coding: utf-8 -*-
import urllib
from bs4 import BeautifulSoup
import re
"""
"""
def get_all_url(url):
    """Download *url* and return every absolute http(s) link found on it.

    url -- address of the page to fetch.
    Returns a list of href strings (possibly empty).
    """
    web = urllib.urlopen(url)
    try:
        # Explicit parser avoids bs4's "no parser specified" warning and
        # keeps results consistent across environments.
        soup = BeautifulSoup(web.read(), "html.parser")
    finally:
        web.close()  # the original leaked the connection
    # Only anchors whose href is an absolute http:// or https:// URL.
    tags_a = soup.findAll(name='a', attrs={'href': re.compile("^https?://")})
    urls = []
    for tag_a in tags_a:
        try:
            urls.append(tag_a['href'])
        except KeyError:
            # Should not happen (we matched on href); skip the one bad tag
            # instead of the original's bare except that aborted the loop
            # and hid every other error.
            continue
    return urls
"""
replace("A","B",n):
A替换为B,替换n个。
如果没有参数n默认为全部替换
"""
# Collect the links whose host part contains freebuf.com
def get_local_urls(url):
    """Return the links on *url* whose host portion contains 'freebuf.com'.

    The host is approximated by stripping '//' and taking everything up to
    the first remaining '/'.
    """
    return [
        link for link in get_all_url(url)
        if 'freebuf.com' in link.replace('//', '').split('/')[0]
    ]
# Collect the links whose host part does not contain freebuf.com
def get_remote_urls(url):
    """Return the links on *url* whose host portion lacks 'freebuf.com'.

    Mirrors get_local_urls: the host is approximated by stripping '//'
    and taking everything up to the first remaining '/'.
    """
    return [
        link for link in get_all_url(url)
        if "freebuf.com" not in link.replace('//', '').split('/')[0]
    ]
def __main__():
url="http://freebuf.com/"
rurls=get_remote_urls(url)
print "-------------------------remote urls---------------"
for ret in rurls:
print ret
print "-------------------------local urls----------------"
lurls=get_local_urls(url)
for ret in lurls:
print ret
# Crawler notes, chapter 2 (part 1). (Blog-page metadata removed: the raw
# scraped text here was not valid Python.)