# 1、使用 Python 下载百思不得姐网站的图片
#-*- coding:UTF-8 -*- #编码设置
#下载百思不得姐网站图片
#引入依赖包
import requests
import re
import urllib
import os.path
# Fetch a page from the site.
def get_response(url, timeout=30):
    """Send a GET request to *url* and return the raw response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` can block indefinitely on a
            stalled connection, which would hang the whole crawl.

    Returns:
        The ``content`` attribute of the response (the undecoded body).

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    return requests.get(url, timeout=timeout).content
# Cut the page into per-post HTML fragments with a regular expression.
def get_content(html):
    """Return every ``<div class="j-r-list-c">`` fragment found in *html*.

    Each fragment runs from the opening tag up to and including the second
    ``</div>`` that follows it. DOTALL is set so a fragment may span lines.
    """
    fragment_pattern = re.compile(
        r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.DOTALL
    )
    return fragment_pattern.findall(html)
# Extract image addresses with a regular expression.
def get_image_url(response):
    """Return the value of every ``data-original="..."`` attribute in *response*.

    The result is a list in document order; it is empty when nothing matches.
    """
    attribute_pattern = re.compile(r'data-original="(.*?)"')
    return attribute_pattern.findall(response)
# Extract post titles (used as file names) with a regular expression.
def get_image_name(response):
    """Return the link text of every ``/detail-<id>.html`` anchor in *response*.

    The id part may be 1 to 8 characters long, and the dot before ``html`` is
    matched literally. The previous pattern accepted only ids of exactly 8
    characters and let any character stand in for that dot.

    Returns:
        A list of link texts in document order; empty when nothing matches.
    """
    reg = re.compile(r'<a href="/detail-.{1,8}\.html">(.*?)</a>')
    return reg.findall(response)
# Download one image.
def download_image(image_url, path):
    """Download *image_url* to the hard-coded target directory.

    The file name is *path* with all whitespace removed, followed by the
    extension taken from *image_url*.

    Args:
        image_url: Address of the image; a UTF-8 byte string (Python 2 ``str``).
        path: Title to use as the file's base name; a UTF-8 byte string.

    Raises:
        Whatever the encode/decode calls or ``urllib.urlretrieve`` raise; this
        function does not catch anything itself.
    """
    # Strip every whitespace character from the title.
    file_name = ''.join(path.split())
    # Re-encode from UTF-8 to GBK -- presumably the code page of the local
    # Windows file system; confirm before running elsewhere.
    file_name = file_name.decode('utf-8').encode('gbk')
    # splitext() already returns the extension with its leading dot (".jpg"),
    # so it is appended as-is. Prepending another "." produced "name..jpg".
    extension = os.path.splitext(image_url.decode('utf-8').encode('gbk'))[1]
    target = 'D:\\xx\\{}'.format(file_name) + extension
    # Save to the local disk.
    urllib.urlretrieve(image_url, target)
def get_url_name(start_url):
    """Download the first image of every post on the list page at *start_url*.

    The page is fetched once and split into post fragments. For each fragment
    that contains an image address, the first address is downloaded under the
    first title found in that fragment. A failure on one post is reported and
    the loop moves on to the next post.

    Args:
        start_url: Address of the list page to crawl.
    """
    print(start_url)
    content = get_content(get_response(start_url))
    for fragment in content:
        image_urls = get_image_url(fragment)
        if not image_urls:
            continue
        image_names = get_image_name(fragment)
        try:
            # image_names[0] stays inside the try: a post without a title
            # raises IndexError and is reported like any other failure.
            download_image(image_urls[0], image_names[0])
        except Exception:
            # image_urls is a list; concatenating the list itself to the
            # message raised TypeError here and aborted the whole crawl.
            print('download error:' + image_urls[0])
# Download from every list page in turn.
def main():
    """Call get_url_name on each entry of the module-level ``start_urls`` list."""
    for page_url in start_urls:
        get_url_name(page_url)
# Script entry point.
if __name__ == '__main__':
    # main() reads this module-level list, so it is built before the call.
    # It holds the list pages numbered 1 through 9.
    start_urls = [
        'http://www.budejie.com/{}'.format(page_number)
        for page_number in range(1, 10)
    ]
    main()
# 2、读取段子
#-*- coding:UTF-8 -*- #编码设置
#下载百思不得姐网站段子
#引入依赖包
import requests
import re
import time
# Fetch a page from the site.
def get_response(url, timeout=30):
    """Send a GET request to *url* and return the raw response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` can block indefinitely on a
            stalled connection, which would hang the whole crawl.

    Returns:
        The ``content`` attribute of the response (the undecoded body).

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    return requests.get(url, timeout=timeout).content
# Cut the page into per-post HTML fragments with a regular expression.
def get_content(html):
    """Return every ``<div class="j-r-list-c">`` fragment found in *html*.

    Each fragment runs from the opening tag up to and including the second
    ``</div>`` that follows it. DOTALL is set so a fragment may span lines.
    """
    fragment_pattern = re.compile(
        r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.DOTALL
    )
    return fragment_pattern.findall(html)
# Extract post titles with a regular expression.
def get_image_name(response):
    """Return the link text of every ``/detail-<id>.html`` anchor in *response*.

    The id part may be 1 to 8 characters long. The dot before ``html`` is
    matched literally; previously it was an unescaped ``.`` that accepted any
    character in that position.

    Returns:
        A list of link texts in document order; empty when nothing matches.
    """
    reg = re.compile(r'<a href="/detail-.{1,8}\.html">(.*?)</a>')
    return reg.findall(response)
def get_url_name(start_url):
    """Print the first title of every post on the list page at *start_url*.

    The address itself is printed first. Fragments in which no title is found
    are skipped silently.
    """
    print(start_url)
    # Pause before each request so the site does not treat the crawl as an attack.
    time.sleep(1)
    page = get_response(start_url)
    for fragment in get_content(page):
        titles = get_image_name(fragment)
        if not titles:
            continue
        print(titles[0])
# Read every list page in turn.
def main():
    """Call get_url_name on each entry of the module-level ``start_urls`` list."""
    for page_url in start_urls:
        get_url_name(page_url)
# Script entry point.
if __name__ == '__main__':
    # main() reads this module-level list, so it is built before the call.
    # It holds the text list pages numbered 1 through 19.
    start_urls = [
        'http://www.budejie.com/text/{}'.format(page_number)
        for page_number in range(1, 20)
    ]
    main()