# Scrape cursive-script (caoshu) calligraphy images from the calligraphy site
# http://www.shufazidian.com/
import requests
from bs4 import BeautifulSoup
import os
def get_page(url, word):
    """POST a search for *word* to the calligraphy site and return the raw page.

    Args:
        url: Base URL of the search endpoint (http://www.shufazidian.com/).
        word: The character/word to search for; sent as form field ``wd``.

    Returns:
        The response body as bytes on success, or ``""`` on any request
        failure (callers feed the result straight into BeautifulSoup, which
        accepts an empty string).
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0)like Gecko Core/1.70.3704.400 QQBrowser/10.4.3587.400",
        "referer": "http://www.shufazidian.com/",
        "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN",
        "Cache-Control": "no-cache",
        "Connection": "Keep-Alive",
        # NOTE: Content-Length is deliberately NOT set here; the original
        # hard-coded "19", which is wrong whenever the form body has a
        # different length. requests computes the correct value itself.
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": "cookiesort=7; Hm_lvt_5ac259f575081df787744e91bb73f04e=1563974376,1564218809; Hm_lpvt_5ac259f575081df787744e91bb73f04e=1564226330",
        "Host": "www.shufazidian.com"
    }
    data = {
        'wd': word,
        'sort': 7  # site-specific sort mode: 7 = cursive script (草书)
    }
    try:
        r = requests.post(url, headers=headers, data=data, timeout=10)
        # Fail fast on HTTP errors before handing the body to the caller.
        r.raise_for_status()
        # Return raw bytes; BeautifulSoup performs its own decoding, so
        # there is no need to touch r.encoding here.
        return r.content
    except requests.RequestException:
        # Narrow except: only network/HTTP problems are expected here;
        # preserve the original best-effort contract of returning "".
        return ""
def parse_page(html):
    """Parse a search-result page into a {image_url: author_title} dict.

    The first ``mbpho`` element is skipped, matching the original code
    (presumably it is a header/sample entry — TODO confirm against the
    live page).

    Args:
        html: Raw HTML (bytes or str) of the search-result page.

    Returns:
        Dict mapping each image ``src`` URL to its ``title`` (author) text.
        Entries missing an <a>, <img>, ``src`` or ``title`` are skipped
        instead of raising, and an empty page yields an empty dict.
    """
    soup = BeautifulSoup(html, "lxml")
    entries = soup.find_all(class_="mbpho")  # one element per picture card
    pic_dic = {}
    for entry in entries[1:]:  # skip the first (header) entry
        anchor = entry.find(name="a")
        if anchor is None or "title" not in anchor.attrs:
            continue  # malformed card — skip rather than crash
        img = anchor.find(name="img")
        if img is None or "src" not in img.attrs:
            continue
        # Build the mapping directly; later duplicates of the same URL
        # overwrite earlier ones, exactly as dict(zip(...)) did.
        pic_dic[img["src"]] = anchor["title"]
    return pic_dic
def to_file(url, word):
    """Download every image found for *word* into ``E://shufa/<word>/``.

    Args:
        url: Base URL of the search site, forwarded to :func:`get_page`.
        word: Search term; also used as the per-term directory name.

    Side effects:
        Creates directories under ``E://shufa``, changes the process's
        current working directory to the per-word folder, and writes one
        ``<author>.jpg`` file per successfully fetched image.
    """
    if not os.path.exists("E://shufa"):  # top-level output directory
        os.mkdir("E://shufa")
    path = "E://shufa//" + word  # per-search-term directory
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)  # downloads below use bare filenames relative to here
    html = get_page(url, word)
    pic_dic = parse_page(html)  # {image_url: author} for this page
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0)like Gecko Core/1.70.3704.400 QQBrowser/10.4.3587.400",
        "Cookie": "cookiesort=7; Hm_lvt_5ac259f575081df787744e91bb73f04e=1563974376,1564218809; Hm_lpvt_5ac259f575081df787744e91bb73f04e=1564226330"
    }
    for link, author in pic_dic.items():
        try:
            response = requests.get(link, headers=header, timeout=10)
            if response.status_code == 200:
                # NOTE(review): `author` comes straight from the page title
                # and may contain characters illegal in filenames — verify.
                # `with` guarantees the handle is closed (the original
                # open(...).write(...) leaked it).
                with open(author + ".jpg", 'wb') as f:
                    f.write(response.content)
                print("{} 保存成功".format(author))
        except requests.RequestException:
            # One bad image must not abort the rest of the batch (the
            # original `return ''` stopped all remaining downloads).
            continue
def main():
    """Run a download batch: one search term at a time."""
    base_url = "http://www.shufazidian.com/"
    search_terms = ["刘", "陶", "林", "张", "任", "爱", "你", "我", "草", "书"]
    for term in search_terms:
        to_file(base_url, term)
"""def main():
url = "http://www.shufazidian.com/"
words = ["刘","陶","林"]
link = list()
name = list()
i=0
if not os.path.exists("E://shufa"):
os.mkdir("E://shufa")
for word in words:
html = get_page(url,word)
pic_dic = parse_page(html)
path = "E://shufa//" + word
if not os.path.exists(path):
os.mkdir(path)
os.chdir(path)
print(word)
for item in pic_dic:
#url = item
print(str(url))
header = {
'User - Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
response = requests.get(item,headers=header)
if response.status_code == 200:
with open(pic_dic.get(item)+".jpg",'wb') as f:
f.write(response.content)
print("保存成功")
link.append(item)
i = i+1
name.append(pic_dic.get(item))
print(name)"""
#print(pic_dic)
# Script entry point: only run the download batch when executed directly,
# not when imported as a module.
if __name__ == '__main__':
main()