This script crawls a site's download links and writes them to a txt file.
If the site has no hotlink protection, you can also try downloading directly with the download function in the script.
The script was written for a short-term, specific target; to crawl links with other patterns, you will need to adjust the matching expressions yourself.
I am a Python beginner, so corrections are very welcome.
# -*- coding: utf-8 -*-
# Note: this is a Python 2 script (urllib.urlopen, print statements).
import re
import urllib
import urllib2
import requests
import time
#download the file
def download(page, url):
    local_filename = url.split('/')[-1] + page + '.jpg'
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename
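# A hypothetical call (the id and URL below are made up; this only succeeds
# when the site does not enforce hotlink protection, as noted in the intro):
#download('46876', 'http://www.XXX.com/download/0001.jpg')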
#turn the array of href="..." matches into an array of absolute urls
def print_urls(urls):
    output_urls = []
    for link in urls:
        start_link = link.find('"')
        end_link = link.find('"', start_link + 1)
        output_link = link[start_link + 1:end_link]
        if output_link.find('http') == -1:
            output_link = 'http://www.XXX.com' + output_link
        if link.count('"') > 2:
            # more than one quoted section means a malformed match; skip it
            continue
        else:
            output_urls.append(output_link)
    return output_urls
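# For example (made-up input), print_urls(['href="/thing/123"']) returns
# ['http://www.XXX.com/thing/123']; matches containing more than two quote
# characters are treated as malformed and skipped.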
def output_download_link_page(page):
    url = page
    s = urllib.urlopen(url).read()
    urls = []
    new_stl_urls = []
    title = re.findall(r'<title>.+</title>', s, re.I)
    if len(title) != 0:
        title = title[0]
    else:
        title = 'no title'
    img_urls = print_urls(re.findall(r'href=".*?\.jpg.*?"', s, re.I))
    if len(img_urls) != 0:
        img_urls = img_urls[0]
    else:
        img_urls = 'no image' + page
    stl_urls = print_urls(set(re.findall(r'href="/download/.*?"', s, re.I)))
    for url in stl_urls:
        # follow redirects to record the final download url
        #url = urllib2.urlopen(url).url
        url = requests.get(url).url
        new_stl_urls.append(url)
    urls.append(title)
    urls.append(img_urls)
    urls = urls + new_stl_urls
    return urls
#print output_download_link_page('http://www.XXX.com/thing/46876')
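# The commented call above (page id made up) would return a list shaped like:
# ['<title>Model name</title>', 'http://www.XXX.com/images/1.jpg',
#  'http://www.XXX.com/download/...', ...]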
#output all links to download
def output_all_pages(site):
    s = urllib.urlopen(site).read()
    page = re.findall(r'href="/thing/.*?"', s, re.I)
    page = set(page)
    return print_urls(page)
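# For instance, output_all_pages('http://www.XXX.com/popular?query=&pg=40')
# yields the deduplicated absolute '/thing/...' URLs found on that listing page.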
#generate the listing pages to crawl
def generate_sites(start, end):
    sites = []
    for num in range(start, end):
        sites.append('http://www.XXX.com/popular?query=&pg=' + str(num))
    return sites
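# Example: generate_sites(40, 42) returns
# ['http://www.XXX.com/popular?query=&pg=40',
#  'http://www.XXX.com/popular?query=&pg=41']  (end is exclusive)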
#write all the results to a txt file
file_new = open('1.txt', 'a')  # append mode, so the file need not exist beforehand
sites = generate_sites(40, 46)
count = 0
for site in sites:
    print site
    file_new.write('\n' + site)
    pages = output_all_pages(site)
    for page in pages:
        urls = output_download_link_page(page)
        # skip pages that yield 10 or more links (presumably multi-file
        # pages that are not wanted here)
        if len(urls) >= 10:
            continue
        count = count + 1
        for url in urls:
            file_new.write(url + '\n')
        print 'done'
        time.sleep(10)  # be polite to the server between pages
file_new.close()
print 'all done. all..' + str(count) + '..models'