中文的处理

import HTMLParser
# Demo: decode HTML numeric character references into the characters they
# stand for.
html_parser = HTMLParser.HTMLParser()
# The references must be written without inner spaces ("&#NNNN;"); with
# spaces such as "& # 23567;" there is nothing for unescape() to decode and
# the text would be printed unchanged.
s = html_parser.unescape('&#23567;&#23376;')
print(s)

#coding:utf-8
import urllib
import re
from urllib import quote
from uConvert import parser as u

def getWeiboInfo(keyword):
    """Download a page, extract each post's text and image URLs, and save them.

    The downloaded content has its escaped quotes, escaped newlines and
    escaped slashes undone, then every ``<dd class="content">`` fragment is
    extracted. For each fragment the text (with unicode escapes decoded by
    ``u`` and encoded as UTF-8) is followed by one image URL per line, with
    thumbnail/square URLs rewritten to their ``bmiddle`` variant. The result
    is written to ``result.inc`` in the current working directory.

    NOTE(review): ``url`` is not defined inside this function, so it has to
    exist as a module-level global, and ``keyword`` is currently unused.
    Presumably ``url`` was meant to be built from ``keyword`` (``quote`` is
    imported by this file but not used here) -- confirm and fix at the caller.

    :param keyword: search keyword (currently unused, see note above).
    :returns: None; output goes to ``result.inc`` and progress to stdout.
    """
    webcontent = urllib.urlopen(url).read()

    # Undo the backslash escaping of the markup before pattern matching.
    for escaped, plain in (('\\"', '"'), ('\\n', ''), ('\\/', '/')):
        webcontent = webcontent.replace(escaped, plain)

    txts = re.findall(
        r'<dd class="content">([\s\S]*?)<p class="info W_linkb W_textb">',
        webcontent)

    print(repr(txts))

    entries = []
    for txt in txts:
        imageList = re.findall('<img class="" src="(.*?)"', txt)
        imageList += re.findall('<img class="bigcursor" src="(.*?)"', txt)

        if imageList:
            print("Images Found...")
        else:
            print("No Image...")

        # Point small previews at the medium-sized variant of the same image.
        strImageList = " ".join(imageList)
        strImageList = strImageList.replace("/square/", "/bmiddle/")
        strImageList = strImageList.replace("/thumbnail/", "/bmiddle/")

        # With no images this contributes an empty line, which keeps the
        # entries separated in the output file.
        entries.append(u(txt).encode("utf-8") + "\n"
                       + "\n".join(strImageList.split(" ")) + "\n")

    # The context manager closes the file even if the write fails.
    with open("result.inc", 'w') as f:
        f.write("".join(entries))

# States of the escape-sequence scanner used by parser(). The numeric order
# matters: the end-of-input flush compares states with ">=".
STATE_NORMAL = -1   # copying characters through unchanged
STATE_SLASH = 0     # a backslash has been read and is still pending
STATE_BEGIN = 1     # "\u" has been read, no hex digits collected yet
STATE_UNICODE = 2   # collecting the hex digits of a "\uXXXX" escape


def parser(inpstr):
    """Replace every ``\\uXXXX`` escape in *inpstr* with its character.

    Only a backslash, a ``u`` and exactly four hexadecimal digits form an
    escape. Everything else is copied through unchanged, including other
    backslash sequences, malformed escapes and an escape cut short by the
    end of the input.

    :param inpstr: text that may contain literal ``\\uXXXX`` sequences.
    :returns: the text with all well-formed escapes decoded.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    hex_digits = '0123456789abcdefABCDEF'
    pieces = []
    state = STATE_NORMAL
    unicode_hex = ''

    for char in inpstr:
        if state == STATE_NORMAL:
            if char == '\\':
                state = STATE_SLASH
            else:
                pieces.append(char)
        elif state == STATE_SLASH:
            if char == 'u':
                state = STATE_BEGIN
                unicode_hex = ''
            elif char == '\\':
                # Emit the earlier backslash; this one may itself start an
                # escape, so it stays pending.
                pieces.append('\\')
            else:
                # Not a unicode escape: keep the backslash and the character.
                pieces.append('\\' + char)
                state = STATE_NORMAL
        else:
            # STATE_BEGIN or STATE_UNICODE: gathering the four hex digits.
            if char == '\\':
                # The escape was cut short; emit what was read verbatim and
                # treat this backslash as a possible new escape.
                pieces.append('\\u' + unicode_hex)
                state = STATE_SLASH
                continue
            unicode_hex += char
            state = STATE_UNICODE
            if len(unicode_hex) == 4:
                # Check the digits explicitly: int(x, 16) would also accept
                # forms such as "0x41" or "+041".
                if all(digit in hex_digits for digit in unicode_hex):
                    pieces.append(to_char(int(unicode_hex, 16)))
                else:
                    pieces.append('\\u' + unicode_hex)
                state = STATE_NORMAL

    # Input ended inside an escape: emit the consumed characters unchanged.
    if state >= STATE_SLASH:
        pieces.append('\\')
    if state >= STATE_BEGIN:
        pieces.append('u')
    if state == STATE_UNICODE:
        pieces.append(unicode_hex)
    return ''.join(pieces)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值