python简单爬取一个blogs内容

最新推荐文章于 2024-08-12 09:07:37 发布

jamst8522127

最新推荐文章于 2024-08-12 09:07:37 发布

阅读量118

点赞数

分类专栏：PYTHON、爬虫　文章标签：python

本文链接：https://blog.csdn.net/jamst8522127/article/details/84912087

版权

PYTHON 同时被 2 个专栏收录

5 篇文章 0 订阅

订阅专栏

爬虫

2 篇文章 0 订阅

订阅专栏


# -*- coding: utf-8 -*-

from urllib2 import urlopen,Request

import urllib

from lxml import *

import lxml.html as HTML

import time

def error(txt):
    """Append *txt* as a single line to the log file ``../it/error.txt``.

    The file is opened in append mode, so earlier entries are kept.
    """
    line = txt + '\n'
    with open("../it/error.txt", "a") as log_file:
        log_file.write(line)

def con(url, count=4):
    """Fetch *url* and return the raw response body, retrying on failure.

    The request carries a fixed ``Referer`` and ``User-Agent`` header and
    uses a 20 second timeout.  When any exception occurs the fetch is
    retried after a one second pause, incrementing ``count`` each time.
    Once ``count`` has reached 10 the exception is printed, the URL is
    appended to the error log via ``error()`` and ``None`` is returned.

    :param url: address to download.
    :param count: starting value of the attempt counter; retries stop
        once it reaches 10.
    :returns: the response body as read from the connection, or ``None``
        when every attempt failed.
    """
    while True:
        try:
            req = Request(url)
            req.add_header('Referer', 'http://www.baidu.com')
            req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            res = urlopen(req, timeout=20)
            try:
                return res.read()
            finally:
                # Release the connection even when read() fails part-way.
                res.close()
        except Exception as e:
            if count >= 10:
                # Out of attempts: report, log the URL and give up.
                print(e)
                error(url)
                return None
            count += 1
            time.sleep(1)

def menu(url):
    """Yield one ``{'title': ..., 'url': ...}`` dict per ``//h5/a`` link on *url*.

    The page is downloaded with ``con()`` and parsed with lxml.  Each
    link's ``href`` is prefixed with ``"http:"``.
    NOTE(review): the prefix presumably assumes protocol-relative hrefs
    (``//host/path``) - confirm against the target site.

    Links without text or without an ``href`` attribute are skipped, and
    nothing is yielded when the page could not be downloaded.

    :param url: address of the index page to scan.
    """
    page = con(url)
    if not page:
        # con() returns None once its retries are exhausted; an empty body
        # cannot be parsed either, so there is nothing to yield.
        return

    dom = HTML.document_fromstring(page)
    for n in dom.xpath("//h5/a"):
        title = n.text_content()
        href = n.get("href")
        # Check the raw href: after prefixing, the URL is always truthy.
        if not (title and href):
            continue
        yield {'title': title, 'url': "http:" + href}

def save(title, content):
    """Write *content* to ``../it/<title>.html``.

    The title comes from a scraped page, so it is trimmed and every
    character that is a path separator or otherwise unsafe in a file name
    is replaced with ``_``; the file therefore always lands directly in
    ``../it/``.

    :param title: text used as the file name (without extension).
    :param content: data to write; ``blog()`` passes the body returned by
        ``con()``.
    """
    name = unicode(title).strip()
    for bad in u'\\/:*?"<>|\r\n\t':
        name = name.replace(bad, u'_')
    # Binary mode: the downloaded body must be stored unmodified (text mode
    # would translate line endings on some platforms).
    with open(u'../it/' + name + u'.html', 'wb') as f:
        f.write(content)

def blog():
    """Download every article linked from http://www.schooltop.net.

    Each link yielded by ``menu()`` is fetched with ``con()`` and stored
    with ``save()`` under its title; a progress line is printed per saved
    article.  Articles that cannot be downloaded are skipped so one
    failure does not abort the whole crawl.
    """
    for dic in menu("http://www.schooltop.net"):
        title = dic.get("title", '')
        url = dic.get("url", '')
        if not url:
            continue

        page = con(url)
        if page is None:
            # con() has already logged the URL; passing None on to save()
            # would raise inside write().
            continue

        save(title, page)
        # Single-argument print works as statement and as function; the
        # join reproduces the space the two-item print statement inserted.
        print(" ".join(["saved      ", unicode(title)]))



if __name__ == "__main__":
    # Script entry point: run the crawl only when executed directly.
    blog()

方法二：用 urllib2 配合正则表达式，直接抓取指定文章的正文：


import urllib2
import re  
# Fetch individual posts by id and print the article body, which is cut out
# of the page with a regular expression.
arr = ['289', '300']

# Compiled once, outside the loop; re.S lets '.' match across newlines.
pattern = re.compile(r'<div class="article">(.*?)<div class="row t_margin_20">', re.S)

for i in arr:
    # Timeout so a stalled connection cannot hang the script.
    res = urllib2.urlopen('http://www.schooltop.net/blogs/' + i, timeout=20)
    try:
        content = res.read()
    finally:
        res.close()

    match = pattern.search(content)
    if match:
        print(match.group(1))
    else:
        # Marker printed when the article container is not found.
        print(111)

jamst8522127

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python简单爬取一个blogs内容

[code="python"]# -*- coding: utf-8 -*-from urllib2 import urlopen,Requestimport urllibfrom lxml import *import lxml.html as HTMLimport timedef error(txt): with open("....
复制链接

扫一扫

专栏目录