唯美少女网页爬虫源码
# -*- coding: utf-8 -*-
"""
@author: tanderick
"""
import requests
import re
import time
import os
import urllib.parse
# First (flat) version of the scraper: search vmgirls.com for a keyword,
# visit every album found, and save each .jpg into a per-album folder.

# Root directory that receives one sub-folder per album.
filepath = str(r'C:/ml/data/image/vm/')
# makedirs (rather than mkdir) also creates any missing parent folders and,
# with exist_ok=True, does not fail when the directory is already there.
os.makedirs(filepath, exist_ok=True)

keyword = '校园'  # search term
kw = urllib.parse.quote(keyword)  # percent-encode for use in the query string
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
}

# Fetch the search-result page and extract (album URL, album title) pairs.
# The timeout keeps a stalled connection from hanging the script forever.
t = requests.get('https://www.vmgirls.com/?s=' + kw, headers=headers, timeout=30)
html = t.text
m_urls = re.findall('<a href="(.*?)" title="(.*?)" ', html)

for m_url, name in m_urls:
    t = requests.get(m_url, headers=headers, timeout=30)
    html = t.text
    # Capture everything between a leading "http" and a trailing "jpg";
    # both ends are glued back on below to rebuild the full URL.
    urls = re.findall('http(.*?)jpg', html)

    # The title comes from scraped HTML, so replace characters that are not
    # allowed in Windows file names before using it as a folder name.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    filepath1 = str(filepath + safe_name + '/')
    os.makedirs(filepath1, exist_ok=True)

    time.sleep(5)  # pause between albums to avoid hammering the server

    for url in urls:
        url = urllib.parse.unquote(str('http' + url + 'jpg'))
        filename = url.split('/')[-1]
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            # One unreachable image should not abort the whole crawl.
            print(url + '下载失败')
            continue
        if not response.ok:
            # Do not save an HTTP error page under a .jpg name.
            print(url + '下载失败')
            continue
        with open(filepath1 + filename, 'wb') as f:
            f.write(response.content)
        print(url + '下载完成')
这个是优化了一下结构的版本，看着舒服点
# -*- coding: utf-8 -*-
"""
@author: tanderick
"""
import requests
import re
import time
import os
import urllib.parse
#调用包
def search(keyword, headers, filepath):
    """Search vmgirls.com for *keyword* and return the links found.

    As a side effect, the download root ``filepath`` is created if it does
    not exist yet.

    Args:
        keyword: Search term; it is percent-encoded before being appended
            to the query string.
        headers: HTTP headers passed to ``requests.get``.
        filepath: Root download directory to create when missing.

    Returns:
        A list of ``(href, title)`` string tuples, one for every
        ``<a href="..." title="..." `` match in the result page. These are
        the second-level (album) page addresses.
    """
    kw = urllib.parse.quote(keyword)
    # The timeout keeps a stalled connection from hanging the caller forever.
    t = requests.get('https://www.vmgirls.com/?s=' + kw, headers=headers, timeout=30)
    m_html = t.text
    m_urls = re.findall('<a href="(.*?)" title="(.*?)" ', m_html)
    # makedirs also creates missing parent folders; exist_ok=True removes the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(filepath, exist_ok=True)
    return m_urls
def download(urls, filepath1, headers=None):
    """Download every image in *urls* into the directory *filepath1*.

    Each file is named after the last path segment of its URL.

    Args:
        urls: Iterable of URL fragments as captured by the
            ``'http(.*?)jpg'`` pattern, i.e. without the leading ``http``
            and the trailing ``jpg``; both are added back here.
        filepath1: Target directory path. The file name is appended to it
            directly, so it must end with a path separator.
        headers: Optional HTTP headers for ``requests.get``. When omitted,
            a module-level ``headers`` variable is used if one exists,
            which keeps existing two-argument calls working.

    Returns:
        None.
    """
    if headers is None:
        # Look the global up explicitly so that a missing module-level
        # ``headers`` yields None instead of a NameError.
        headers = globals().get('headers')

    for url in urls:
        url = urllib.parse.unquote('http' + url + 'jpg')
        filename = url.split('/')[-1]
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            # One unreachable image should not abort the remaining downloads.
            response = None

        if response is not None and response.ok:
            with open(filepath1 + filename, 'wb') as f:
                f.write(response.content)
            print(url + '下载完成')
        else:
            # Do not save an HTTP error page under an image file name.
            print(url + '下载失败')

        time.sleep(1)  # short pause between images to go easy on the server
    return
def get_url(m_urls, filepath, headers):
    """Visit each album page, collect its image addresses and download them.

    For every ``(url, title)`` pair the page is fetched, all
    ``http...jpg`` fragments are extracted, a sub-directory named after the
    title is created below *filepath*, and ``download`` is called for it.

    Args:
        m_urls: Iterable of ``(page_url, title)`` pairs.
        filepath: Root download directory; the album folder name is
            appended to it directly, so it must end with a path separator.
        headers: HTTP headers passed to ``requests.get`` for the page fetch.

    Returns:
        A tuple ``(urls, filepath1)`` holding the image fragments and the
        directory of the last album processed, or ``([], None)`` when
        *m_urls* is empty.
    """
    # Pre-bind the results so an empty m_urls does not end in an
    # UnboundLocalError at the return statement.
    urls, filepath1 = [], None
    for m_url, name in m_urls:
        t = requests.get(m_url, headers=headers, timeout=30)
        html = t.text
        urls = re.findall('http(.*?)jpg', html)
        # The title comes from scraped HTML, so replace characters that are
        # not allowed in Windows file names before using it as a folder name.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        filepath1 = filepath + safe_name + '/'
        os.makedirs(filepath1, exist_ok=True)
        download(urls, filepath1)
    return (urls, filepath1)
if __name__ == '__main__':
    # Main program: configure the run, search for albums, then download them.
    # These names stay at module level on purpose: download() reads
    # ``headers`` as a global.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'}
    filepath = r'C:/ml/data/image/vm/'  # root download directory
    keyword = '校园'  # search term

    m_urls = search(keyword, headers, filepath)
    get_url(m_urls, filepath, headers)