# -*-coding:utf-8-*-
__author__ = 'fankai'
"""
https://time.geekbang.org/column/article/76001
从上面的网址下载王祖贤的电影海报
https://movie.douban.com/subject_search?search_text=%E7%8E%8B%E7%A5%96%E8%B4%A4&cat=1002&start=0
"""
import requests
from lxml import etree
from selenium import webdriver
import re
# Search keyword inserted into the `search_text` parameter of the Douban search URL.
query = '王祖贤'
def download(src, title):
    """Download the image at ``src`` and save it as ``<title>.jpg``.

    The file is written into the hard-coded picture directory on drive G:.
    If the image cannot be fetched (connection error, timeout, or an HTTP
    error status), a message is printed and nothing is written to disk.

    Args:
        src: URL of the image to download.
        title: Base name (without extension) for the saved file; converted
            with ``str()`` before being appended to the directory path.

    Returns:
        None.
    """
    target = 'G:\\爬虫涉及到的一些数据网址\\movespicture\\' + str(title) + '.jpg'
    try:
        pic = requests.get(src, timeout=10)
        # Treat 4xx/5xx answers as failures so an error page is never saved as a .jpg.
        pic.raise_for_status()
    except requests.exceptions.RequestException:
        # RequestException also covers read timeouts and HTTP errors, not only
        # connection failures.
        print("电影图片无法加载")
        # Bail out: without a response there is nothing to write, and falling
        # through would fail on the unbound ``pic``.
        return
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(target, 'wb') as fp:
        fp.write(pic.content)
# XPath expressions selecting the poster URL and the title text of each search result.
src_path = "//*[@class='item-root']/a/img/@src"
title_path = "//*[@class='item-root']/div[@class='detail']/div[@class='title']/a/text()"

# Raw string so the backslashes of the Windows path are never read as escape sequences.
# One browser instance is shared by all pages instead of launching a new one per page.
# NOTE(review): recent Selenium 4 releases no longer accept the driver path as a
# positional argument (a Service object is expected) - confirm the installed version.
driver = webdriver.Chrome(r"C:\SoftWare\Google\Chrome\Application\chromedriver.exe")
try:
    # Walk the result pages through the `start` offset: 0, 15, ..., 75.
    for i in range(0, 6 * 15, 15):
        request_url = "https://movie.douban.com/subject_search?search_text=" + str(query) + "&cat=1002&start=" + str(i)
        driver.get(request_url)
        html = etree.HTML(driver.page_source)
        srcs = html.xpath(src_path)
        titles = html.xpath(title_path)
        for src, title in zip(srcs, titles):
            # Collapse whitespace runs into "_" and drop "?" so the title can be
            # used as a file name.
            title = '_'.join(title.split())
            title = title.replace('?', '')
            print(src, title)
            download(src, title)
finally:
    # Always shut the browser and the chromedriver process down, even on errors.
    driver.quit()
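# Note: we ran into a problem along the way,
# and solved it by adding two lines of code to the program:
#     title = '_'.join(title.split())
#     title = title.replace('?', '')
# The first line splits the title on whitespace and joins the pieces with "_".
# The second line removes the special character "?" from the title
# (it is replaced with an empty string, not with a space).
# The explanation we were given: the name of a file or picture must not
# contain special characters.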