The scraping code

a_template.py
# coding=utf-8

import re
import urllib
import string
import os
import time
import sys
import codecs
from urlparse import *

#third-party libraries
from BeautifulSoup import *

#local library modules
from lib.htmlproc import *

#Base scraper: implements the shared crawl flow; subclasses override the
#find* hooks below with site-specific selectors.
class ArticleTemplate():
  url = 'http://url'
  content = ""
  article_list = []
  delay = 0
  page = ""
  vcodec = "gbk"
  site = 'site name'
  atype = 'article type'
  soup = None

  def __init__(self):
    self.article_list = []
    return

  #set self.url
  def setBaseUrl(self, url):
    self.url = url
    return

  #find the article list nodes in a list page
  def findArticleListPage(self, soup):
    return soup.findAll('dl', {'class':'nl_hd clearfix'})

  #parse one article item from the list; returns [title, url]
  def parseArticleListItem(self, item_soup):
    item = [unicode(item_soup.dd.h4.a.string), item_soup.dd.h4.a['href']]
    return item

  # get the title and url of every article on a list page
  # returns: article_list [[title, url]]
  def getArticleList(self, cur_url):
    page = urllib.urlopen(cur_url).read().decode(self.vcodec)

    #find the list content
    soup = BeautifulSoup(page)
    list_page = self.findArticleListPage(soup)

    #iterate the list and parse the item
    for item_page in list_page:
      try:
        item_soup = BeautifulSoup(str(item_page))
        tmp = self.parseArticleListItem(item_soup)
        if tmp is None:
          continue
        #add to article_list
        self.article_list.append(tmp)
      except Exception:
        #skip items that do not match the expected structure
        pass

    return self.article_list

  #find the 'next page' link in a list page
  def findNextPage(self, soup):
    return soup.find('a', {'class':'next'})

  #get next page for article list
  def getNextListPage(self, cur_url):
    page = urllib.urlopen(cur_url).read().decode(self.vcodec)
    soup = BeautifulSoup(page)
    tmp = self.findNextPage(soup)
    if tmp is None:
      return None
    #get() avoids relying on href being the first attribute of the tag
    return tmp.get('href')

  #find next article content page
  def findNextArticlePage(self, soup):
    return soup.find('a', {'class':'bottom'})

  #get next page for article content
  def getNextArticlePage(self, url, page):
    soup = BeautifulSoup(page)
    tmp = self.findNextArticlePage(soup)
    if tmp is None:
      return None
    return urljoin(url, tmp['href'])    

  #find article content
  def findArticleContent(self, soup):
    return soup.find('div', {'id':'cotent_idd'})

  def formatContent(self, content):
    return getPNode(content)

  #get article content, following "next page" links recursively
  def getArticleContent(self, url):
    content = ""
    page = urllib.urlopen(url).read().decode(self.vcodec)
    soup = BeautifulSoup(page)
    tmp1 = self.findArticleContent(soup)

    if tmp1 is None:
      return None

    next_url = self.getNextArticlePage(url, page)
    if next_url:
      print 'read content url:', next_url
      next_content = self.getArticleContent(next_url)
      #the next page may have no content div, so guard against None
      if next_content:
        content = content + next_content
    content = tmp1.__str__() + content

    return self.formatContent(content)
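
The getPNode, removeUrls, and formatImg helpers come from the author's lib.htmlproc module, which the post does not show. Below is a minimal sketch of what such helpers might look like, assuming getPNode keeps only the <p> nodes of a fragment, removeUrls drops anchor tags while keeping their text, and formatImg strips every <img> down to its src; the function names are from the code above, but the bodies are assumptions, not the original implementation.

lib/htmlproc.py (sketch)
# illustrative sketch only -- the real lib.htmlproc is not shown in the post
import re
from BeautifulSoup import BeautifulSoup

def getPNode(html):
  #assumption: keep only the <p> elements of the fragment
  soup = BeautifulSoup(html)
  return ''.join(str(p) for p in soup.findAll('p'))

def removeUrls(html):
  #assumption: drop <a ...> and </a> tags but keep the link text
  return re.sub(r'</?a\b[^>]*>', '', html)

def formatImg(html):
  #assumption: reduce every <img ...> to a bare tag keeping only its src
  def keep_src(m):
    src = re.search(r'src="([^"]*)"', m.group(0))
    return '<img src="%s" />' % (src.group(1) if src else '')
  return re.sub(r'<img\b[^>]*>', keep_src, html)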


zol_pad.py
import sys
import codecs
import urllib

from urlparse import *

from BeautifulSoup import * 

from lib.htmlproc import *
from spider.article.a_template import *


#zol.com.cn scraper: only the site-specific selectors are overridden
class ArticleZolPad(ArticleTemplate):
  site = '中关村在线'
  atype = 'androidpad'

  #find the article list nodes in a list page
  def findArticleListPage(self, soup):
    return soup.findAll('dl', {'class':'nl_hd clearfix'})

  #parse one article item from the list; returns [title, url]
  def parseArticleListItem(self, item_soup):
    item = [unicode(item_soup.dd.h4.a.string), item_soup.dd.h4.a['href']]
    return item

  #find the 'next page' link in a list page
  def findNextPage(self, soup):
    return soup.find('a', {'class':'next'})

  #find next article content page
  def findNextArticlePage(self, soup):
    return soup.find('a', {'class':'bottom'})

  #find article content
  def findArticleContent(self, soup):
    return soup.find('div', {'id':'cotent_idd'})
  




url = ""
class ArticleZolPad1():
    def __init__(self):
        self.url = 'http://pad.zol.com.cn/more/2_1531.shtml'
        self.content = ""
        self.article_list = []
        self.delay = 0
        self.page = ""
        self.vcodec = "gbk"
        self.site = '中关村在线'
        self.iname = 'androidpad'
        return

    # get the article titles and urls from a list page
    # returns: newslist [[title, url]]
    def getNewsList(self):
        count = 0
        newslist = []
        soup = BeautifulSoup(self.page)
        tmp1 = soup.findAll('dl', {'class':'nl_hd clearfix'})
        for item in tmp1:
            news = BeautifulSoup(str(item))
            tmp = [unicode(news.dd.h4.a.string), news.dd.h4.a['href']]
            self.article_list.append(tmp)
            newslist.append(tmp)
            count = count + 1
        return newslist


    # get the next list page; returns its url, or None
    def getNextPage(self):
        soup = BeautifulSoup(self.page)
        tmp = soup.find('a', {'class':'next'})
        if tmp is None:
            return None
        #get() avoids relying on href being the first attribute of the tag
        return tmp.get('href')

    # get the next page of a multi-page article
    def getNewsContentNextPage(self, page):
        soup = BeautifulSoup(page)
        tmp1 = soup.find('a', {'class':'bottom'})
        if tmp1 is None:
            return None
        return urljoin(self.url, tmp1['href'])

    # get the content of an article page (raw, unformatted)
    def __getNewsContent(self, url):
        content = ""
        page = urllib.urlopen(url).read().decode('gbk')
        soup = BeautifulSoup(page)
        tmp1 = soup.find('div', {'id':'cotent_idd'})

        if tmp1 is None:
            return None

        next_url = self.getNewsContentNextPage(page)
        if next_url:
            print 'read content url:', next_url
            #recurse on the raw fetcher so the formatting in
            #getNewsContent is applied only once, at the top level
            next_content = self.__getNewsContent(next_url)
            if next_content:
                content = content + next_content
        content = tmp1.__str__() + content
        return content

    # get the content of an article page, cleaned up for storage
    def getNewsContent(self, url):
        content = self.__getNewsContent(url)
        if content is None:
            return None
        content = getPNode(content)
        content = removeUrls(content)
        content = formatImg(content)
        return content
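
For completeness, here is a minimal driver sketch showing how the refactored classes are meant to be driven: walk the list pages and fetch every article on each one. The start url is the one from ArticleZolPad1 above; the page limit and the urljoin calls for possibly-relative links are assumptions, not part of the original code.

# driver sketch -- illustrative only, not from the original post
from urlparse import urljoin

def crawl(max_pages=3):
    spider = ArticleZolPad()
    cur_url = 'http://pad.zol.com.cn/more/2_1531.shtml'
    page_count = 0
    while cur_url and page_count < max_pages:
        print 'list page:', cur_url
        spider.article_list = []   #getArticleList accumulates, so reset per page
        for title, url in spider.getArticleList(cur_url):
            content = spider.getArticleContent(urljoin(cur_url, url))
            if content is None:
                continue
            print 'fetched:', title
        next_url = spider.getNextListPage(cur_url)
        cur_url = urljoin(cur_url, next_url) if next_url else None
        page_count = page_count + 1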


