This applies when no account login is needed; a version that simulates logging in with an account for the download will be added later.
The idea is to dig inward along the URLs and write the cleaned-up content of each page into a local file.
#coding=utf-8
import urllib
import urllib2
import re
import codecs
from bs4 import BeautifulSoup
import time
image = []
UserAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2372.400 QQBrowser/9.5.10548.400'
header = {'User-Agent':UserAgent}
urlBase = 'http://www.quanshuwang.com/map/1.html'
urlans = []   # hrefs of every <a target="_blank"> link on the index page
urlbook = []  # book titles matched by the regex below
k = 1
try:
    # the site is GBK-encoded, so decode before handing it to BeautifulSoup
    req = urllib2.Request(urlBase, headers=header)
    r = urllib2.urlopen(req).read().decode('gbk')
    BS = BeautifulSoup(r, 'lxml')
    # collect the href of every link that opens in a new tab
    for strint in BS.find_all(target="_blank"):
        href = strint.get('href')
        if href:
            urlans.append(str(href))
    # pull the matching link text (the book titles) out of the raw HTML
    pattern = re.compile('target="_blank">(.*?)</a> ')
    for i in re.findall(pattern, r):
        urlbook.append(i)
except urllib2.URLError as e:
    print e
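The code above only gathers the index links; the "write it into a local file" step from the intro comes next. Below is a minimal sketch of that step under two assumptions of mine: the chapter text sits in a container like <div class="mainContenr"> (that class name is a guess, not confirmed against the site), and the collected hrefs are absolute URLs. It reuses the header dict and the codecs import from above.

def save_page_text(url, filename, headers=header):
    # fetch one book/chapter page with the same GBK decoding as above
    req = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(req).read().decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    node = soup.find('div', class_='mainContenr')  # hypothetical container class
    text = node.get_text() if node is not None else soup.get_text()
    # write the cleaned-up text out as UTF-8
    with codecs.open(filename, 'w', encoding='utf-8') as f:
        f.write(text)

# usage: walk the collected hrefs and save each page locally
for n, u in enumerate(urlans):
    save_page_text(u, str(n) + '.txt')
    time.sleep(1)  # throttle requests so the crawl stays polite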