一、功能描述
通过指定新浪博客主页url,自动爬取目录列表中所有文章的文字。保存所有文章的url。
二、完整代码
reptile.py
#!/usr/bin/python
#coding=utf-8
# Script: reptile.py
# Author: charlotte
# Date: 2016.4.20
#
# Platform:python
import urllib
import os
import artical_content
# Starting from the blog homepage (rep_url), scan the raw HTML to derive `url`,
# the address of the article-directory (list) page that the crawl below fetches.
# NOTE(review): `filename` is not used in the visible part of this script --
# presumably the file the article URLs are written to; confirm further down.
filename = 'url_file'
# Homepage of the blog to crawl.
rep_url = 'http://blog.sina.com.cn/twocold'
# urllib.urlopen is the Python 2 API; .read() returns the whole response body.
content = urllib.urlopen(rep_url).read()
# Offset of the 'blognavInfo' marker in the page, or -1 if it is absent.
# NOTE(review): not referenced again in the visible lines -- possibly meant as
# the start offset for the search on the next line; confirm.
bloginfo = content.find(r'blognavInfo')
# NOTE(review): the next line is truncated in this copy -- the string literal
# never closes, so the search pattern (and any start argument) is missing.
# Restore it from the original script before running.
dir_href = content.find(r'
# Offset of the first '.html' at or after dir_href, or -1 if none is found.
dir_html = content.find(r'.html',dir_href)
# Slice the link out of the page: +5 keeps the '.html' suffix (5 characters);
# +10 presumably skips the prefix matched above -- TODO confirm against the
# restored pattern. Neither find() result is checked for -1, so a page with an
# unexpected layout yields a garbage URL rather than an error here.
url = content[dir_href+10:dir_html+5]
# get all blog article,max 20page
j = 0
while j<20:
content = urllib.urlopen(url).read()
title = content.find(r'