#-*-coding:utf-8-*-
import requests
from bs4 import BeautifulSoup as bs
import re
def getcontent(url, timeout=10, encoding="GBK"):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block the caller
            indefinitely.
        encoding: Character set used to decode the body. Defaults to
            ``"GBK"``, the value this function has always forced.

    Returns:
        The decoded response text (``r.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    r = requests.get(url, timeout=timeout)
    # Override whatever charset requests derived from the response so that
    # ``r.text`` is decoded with the requested encoding.
    r.encoding = encoding
    return r.text
def gethtml(content, www='None'):
    """Collect the targets of every ``<a class="ulink">`` link in *content*.

    Args:
        content: HTML markup to parse.
        www: Prefix put in front of each ``href``. The default is the
            literal string ``'None'``, kept for backward compatibility;
            it and ``None`` are both treated as "no prefix" so that the
            default no longer yields URLs starting with the text "None".

    Returns:
        A list of the prefixed link targets, each encoded as UTF-8, in the
        order BeautifulSoup returns the anchors. Anchors without an
        ``href`` attribute are skipped.
    """
    prefix = '' if www is None or www == 'None' else str(www)
    soup = bs(content, 'html.parser')
    wwwurl = []
    for link in soup.find_all(name='a', attrs={'class': 'ulink'}):
        href = link.get('href')
        if href is None:
            # An anchor can carry the class without an href; concatenating
            # None to the prefix would raise TypeError.
            continue
        wwwurl.append((prefix + href).encode('utf-8'))
    return wwwurl
def getftp(content):
    """Return the FTP download link found in *content*.

    Args:
        content: HTML markup to parse.

    Returns:
        The ``href`` of the last element whose ``href`` contains
        ``ftp://``, encoded as UTF-8, or ``None`` when no element matches.
    """
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether the first or the last match was intended; the last match is
    # returned here -- confirm against the intended behaviour.
    soup = bs(content, 'html.parser')
    # Initialised up front so that a page without any FTP link returns None
    # instead of raising UnboundLocalError.
    ftplink = None
    for link in soup.find_all(href=re.compile("ftp://")):
        ftplink = link.get('href').encode('utf-8')
    return ftplink
# Get HTML information: fetch the index page and collect the linked page URLs.
url = 'http://www.dytt8.net/html/gndy/dyzz/index.html'
# Keep only the first three '/'-separated parts, i.e. "http://www.dytt8.net",
# to use as the prefix for each href (presumably site-relative -- verify).
www = '/'.join(url.split('/')[0:3])
content = getcontent(url)
newurl = gethtml(content, www)

# Get FTP download information: visit each collected page and keep its link.
urltotal = []
for i in newurl:
    ct = getcontent(i)
    ftplink = getftp(ct)
    if ftplink:
        # Skip pages that produced no link; a None entry would make the
        # join below raise TypeError.
        urltotal.append(ftplink)
# The parenthesised form is valid both as a Python 2 print statement and as
# a Python 3 function call.
print('\n'.join(urltotal))
# Reposted from: https://my.oschina.net/davisqi/blog/668802