A simple little Python web crawler

This script crawls a site for download links and writes them to a txt file.

For sites without hotlink protection, the download function in the script can also be used to try a direct download.
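
Hotlink protection usually means the server checks the Referer header of the request; the download helper in the script below sends no such header, which is why it only works on unprotected sites. As a rough, hypothetical variation (not part of the original script), a Referer could be passed through requests like this; the function name and parameters are placeholders:

import requests

# hypothetical variant of the download helper that also sends a Referer header;
# referer_page and local_filename are assumptions, not taken from the original script
def download_with_referer(url, referer_page, local_filename):
    r = requests.get(url, headers={'Referer': referer_page}, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename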

The script was written for a short-term, site-specific task; to crawl resource links with other patterns you will have to adjust the hard-coded URLs and regular expressions yourself.

I am a Python beginner, so corrections and suggestions are very welcome.

# -*- coding: utf-8 -*-

import re
import urllib
import os
import urllib2
import requests
import time

# download the file
def download(page, url):
    local_filename = url.split('/')[-1] + page + '.jpg'
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename

# turn the matched href attributes into an array of absolute urls
def print_urls(urls):
    output_urls = []
    for link in urls:
        # take the text between the first pair of double quotes
        start_link = link.find('"')
        end_link = link.find('"', start_link + 1)
        output_link = link[start_link + 1:end_link]
        # relative links get the site root prepended
        if output_link.find('http') == -1:
            output_link = 'http://www.XXX.com' + output_link
        # skip matches that contain more than one quoted value
        if link.count('"') > 2:
            continue
        else:
            output_urls.append(output_link)
    return output_urls

# collect the title, one preview image url and the real download urls of one page
def output_download_link_page(page):
    url = page
    s = urllib.urlopen(url).read()
    urls = []
    img_urls = 'no image on' + page
    new_stl_urls = []
    # grab the page title (assumed to sit in a <title> tag)
    title = re.findall(r'<title>.+</title>', s, re.I)
    if len(title) != 0:
        title = title[0]
    else:
        title = 'no title'
    # the first jpg link on the page is treated as the preview image
    img_urls = print_urls(re.findall(r'href=".*?\.jpg.*?"', s, re.I))
    if len(img_urls) != 0:
        img_urls = img_urls[0]
    else:
        img_urls = 'no image' + page
    # the /download/ links redirect to the real file urls
    stl_urls = print_urls(set(re.findall(r'href="/download/.*?"', s, re.I)))
    for url in stl_urls:
        # url = urllib2.urlopen(url).url
        url = requests.get(url).url  # follow redirects to the final location
        new_stl_urls.append(url)
    urls.append(title)
    urls.append(img_urls)
    urls = urls + new_stl_urls
    return urls

# print output_download_link_page('http://www.XXX.com/thing/46876')

# collect all /thing/ page links from one listing page
def output_all_pages(site):
    s = urllib.urlopen(site).read()
    page = re.findall(r'href="/thing/.*?"', s, re.I)
    page = set(page)  # drop duplicates
    return print_urls(page)

# build the list of listing pages to crawl
def generate_sites(start, end):
    sites = []
    for num in range(start, end):
        sites.append('http://www.XXX.com/popular?query=&pg=' + str(num))
    return sites

# write all the results to a txt file
file_new = open('1.txt', 'r+')  # 'r+' requires 1.txt to exist already
url_pakage = []
sites = generate_sites(40, 46)
count = 0

for site in sites:
    print site
    file_new.write('\n' + site)
    pages = output_all_pages(site)
    for page in pages:
        urls = output_download_link_page(page)
        # skip pages that return an unexpectedly large number of links
        if len(urls) >= 10:
            continue
        count = count + 1
        for url in urls:
            file_new.write(url + '\n')
        print 'done'
        time.sleep(10)  # be polite to the server between pages

file_new.close()
print 'all done. all..' + str(count) + '..models'
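
For a quick single-page test instead of the full loop, the helpers can also be called by hand. A minimal sketch, assuming the commented example URL http://www.XXX.com/thing/46876 above points at a real listing page, its preview image is not hotlink-protected, and the '46876' suffix is just an arbitrary tag for the local filename:

# hypothetical single-page run; the URL below is the placeholder from the comment above
page = 'http://www.XXX.com/thing/46876'
urls = output_download_link_page(page)  # [title, preview image url, download urls...]
for u in urls:
    print u

# try a direct download of the preview image (only works without anti-hotlinking)
if len(urls) > 1 and urls[1].startswith('http'):
    print download('46876', urls[1])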
