目标:豆瓣读书(https://book.douban.com/),
下载页面上的书籍图片。
import urllib.request
import re #使用正则表达式
def getJpg(date):
    """Find every image-with-alt match in the page source.

    Args:
        date: The page source as a single string.

    Returns:
        A list of 3-tuples of strings, one per match:
        the ``img src="http...jpg"`` text, whatever lies between it and
        the alt part (may span lines, often empty), and the same-line text
        that ends with ``alt="..."``.
    """
    pattern = re.compile(r'(img src="http.+?.jpg")([\s\S]*?)(.+?.alt=".+?.")')
    return pattern.findall(date)
def downLoad(jpgUrl, sTitle, n,
             save_dir='C:\\Users\\74172\\source\\repos\\Python\\spidertest1\\images\\book.douban'):
    """Download one image and store it as ``<sTitle>.jpg`` inside *save_dir*.

    Args:
        jpgUrl: URL of the image to fetch.
        sTitle: Book title, used as the file name (without extension).
        n: Sequence number; only used in the progress message.
        save_dir: Directory that receives the file. Defaults to the folder
            this script has always written to. It is created when missing.

    Any failure is printed instead of raised, so one bad image does not
    stop the caller's loop. The completion message is printed either way.
    """
    import os

    # Titles come straight from the page and may contain characters that
    # cannot appear in a file name (path separators, ':', '?', ...).
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', sTitle)
    target = os.path.join(save_dir, '%s.jpg' % safe_title)
    try:
        os.makedirs(save_dir, exist_ok=True)
        urllib.request.urlretrieve(jpgUrl, target)
    except Exception as e:
        # Deliberate best-effort: report the problem and carry on.
        print(e)
    finally:
        print('图片%s下载操作完成' % n)
def getTitle(date):
    """Return every ``title="...">`` snippet found in the page source.

    Args:
        date: The page source as a single string.

    Returns:
        A list of the matched snippets, each including the leading
        ``title="`` and the trailing ``">``. Only title attributes that are
        immediately followed by ``>`` are matched.
    """
    # The previous pattern used a single ``.`` for the value and therefore
    # only matched one-character titles; ``[^"]+`` accepts any non-empty
    # value while keeping the match inside one pair of quotes.
    return re.findall(r'title="[^"]+">', date)
if __name__ == '__main__':
    # Script entry point: fetch the page, find every image match and
    # download each image under the title taken from its alt text.
    url = 'https://book.douban.com/'
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as res:
        date = res.read().decode('utf-8')
    date_jpg = getJpg(date)
    # enumerate() supplies the 1-based sequence number used in the output.
    for n, jpginfo in enumerate(date_jpg, start=1):
        # Each match is a tuple of strings: the src part first, the alt
        # part last. Read them directly instead of slicing str(jpginfo),
        # whose repr escaping corrupts titles containing quotes/backslashes.
        jpgUrl = re.search(r'http.+?.jpg', jpginfo[0]).group()
        print(n, '--- url -->', jpgUrl)
        sTitle = re.search(r'alt="(.+?.)"', jpginfo[2]).group(1)
        downLoad(jpgUrl, sTitle, n)
后来又做了一些修改,并将书名写入 txt 文件中:
import urllib.request
import re #使用正则表达式
def getJpg(html):
    """Return the image URL of every image-with-alt match in *html*.

    Args:
        html: The page source as a single string.

    Returns:
        A list of URL strings (``http...jpg``), exactly one per match and in
        document order.
    """
    matches = re.findall(r'(img src="http.+?.jpg")([\s\S]*?)(.+?.alt=".+?.")', html)
    # Pull the URL out of the src group of each match. Scanning the repr of
    # the whole list instead lets a second "http" inside one match run on
    # into the next tuple, producing a bogus URL and swallowing a real one.
    return [re.search(r'http.+?.jpg', src).group() for src, _, _ in matches]
def downLoad(jpgUrl, sTitle, n,
             save_dir='C:/Users/74172/source/repos/Python/spidertest1/images/book.douban'):
    """Download one image and store it as ``<sTitle>.jpg`` inside *save_dir*.

    Args:
        jpgUrl: URL of the image to fetch.
        sTitle: Book title, used as the file name (without extension) and
            in the progress message.
        n: Sequence number of the image; accepted for the caller's
            convenience but not used here.
        save_dir: Directory that receives the file. Defaults to the folder
            this script has always written to. It is created when missing.

    Raises:
        Whatever ``os.makedirs`` or ``urllib.request.urlretrieve`` raises;
        errors are not swallowed. The completion message is printed first
        in every case.
    """
    import os

    # Titles come straight from the page and may contain characters that
    # cannot appear in a file name (path separators, ':', '?', ...).
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', sTitle)
    target = os.path.join(save_dir, '%s.jpg' % safe_title)
    try:
        os.makedirs(save_dir, exist_ok=True)
        urllib.request.urlretrieve(jpgUrl, target)
    finally:
        print('图片---%s----下载操作完成' % sTitle)
def getTitle(html):
    """Return the quoted alt text of every image-with-alt match in *html*.

    Args:
        html: The page source as a single string.

    Returns:
        A list of strings, one per match and in document order. Each string
        still carries its surrounding double quotes (``'"Title"'``).
    """
    matches = re.findall(r'(img src="http.+?.jpg")([\s\S]*?)(.+?.alt=".+?.")', html)
    # Read the alt value from the last group of each match. Going through
    # str() of the intermediate lists would inject repr escapes (for
    # example backslashes before an apostrophe) into the titles.
    return [re.search(r'alt=(".+?.")', tail).group(1) for _, _, tail in matches]
def writeTxt(imageTitle, path=None):
    """Append one title as a line to a UTF-8 text file.

    Args:
        imageTitle: Text to append; a newline is written after it.
        path: File to append to. When omitted, the name is built from the
            module-level ``url`` by dropping its first 8 and last 5
            characters and adding ``.txt`` (``book.douban.txt`` for
            ``https://book.douban.com/``).

    Raises:
        OSError: If the file cannot be opened or written.
    """
    if path is None:
        path = url[8:-5] + '.txt'
    # ``with`` closes the file on every path and, unlike a manual
    # try/finally, does not touch an unbound name when open() itself fails.
    with open(path, "a", encoding="utf-8") as f:
        f.write(imageTitle + '\n')
if __name__ == '__main__':
    # Script entry point: fetch the page, then download every image and
    # record its title in the text file.
    # ``url`` stays a module-level name because writeTxt() reads it.
    url = 'https://book.douban.com/'
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as res:
        html = res.read().decode('utf-8')
    urlJpgs = getJpg(html)
    imageTitle = getTitle(html)
    # zip() pairs each URL with its title and stops at the shorter list,
    # so a length mismatch no longer ends in an IndexError mid-run.
    for n, (urlJpg, quotedTitle) in enumerate(zip(urlJpgs, imageTitle)):
        print(n, '--- url -->', urlJpg)
        # getTitle() returns the text with its surrounding double quotes.
        title = quotedTitle[1:-1]
        downLoad(urlJpg, title, n)
        writeTxt(title)