中文的处理_thundor中文是什么意思-CSDN博客

本文链接：https://blog.csdn.net/thundor/article/details/8201038

# Decode HTML numeric character references (e.g. "&#23567;") into text.
import HTMLParser  # Python 2 module; it is named html.parser in Python 3

html_parser = HTMLParser.HTMLParser()
# A reference must be written without internal spaces ("&#NNNNN;");
# with spaces inside it, unescape() leaves the text untouched.
s = html_parser.unescape('&#23567;&#23376;')
print(s)

#coding:utf-8
import urllib
import re
from urllib import quote
from uConvert import parser as u

def getWeiboInfo(keyword):
    """Extract post text and image URLs from a fetched page into ``result.inc``.

    The page at ``url`` is downloaded, the escaped sequences ``\\"``, ``\\n``
    and ``\\/`` are undone, and every span between ``<dd class="content">``
    and ``<p class="info W_linkb W_textb">`` is collected.  For each span the
    ``src`` of matching ``<img>`` tags is gathered, ``/square/`` and
    ``/thumbnail/`` path segments are rewritten to ``/bmiddle/``, and the
    span (decoded with ``u`` and encoded as UTF-8) followed by one image URL
    per line is appended to the output file.

    Args:
        keyword: Currently unused by the visible code.

    NOTE(review): ``url`` is not defined anywhere in the visible code and
    ``keyword`` is never read; presumably the URL is meant to be built from
    ``quote(keyword)`` -- confirm and define it before calling this.
    """
    webcontent = urllib.urlopen(url).read()

    # Undo the backslash escaping applied to the embedded markup.
    for escaped, plain in (('\\"', '"'), ('\\n', ''), ('\\/', '/')):
        webcontent = webcontent.replace(escaped, plain)

    txts = re.findall(
        r'<dd class="content">([\s\S]*?)<p class="info W_linkb W_textb">',
        webcontent)

    print(repr(txts))

    entries = []
    for txt in txts:
        imageList = re.findall(r'<img class="" src="(.*?)"', txt)
        imageList += re.findall(r'<img class="bigcursor" src="(.*?)"', txt)

        if imageList:
            print("Images Found...")
        else:
            print("No Image...")

        # Point small-size image paths at the "bmiddle" variant instead.
        imageList = [
            src.replace("/square/", "/bmiddle/")
               .replace("/thumbnail/", "/bmiddle/")
            for src in imageList
        ]

        entries.append(
            u(txt).encode("utf-8") + "\n" + "\n".join(imageList) + "\n")

    # The context manager closes the file even if the write fails.
    with open("result.inc", 'w') as f:
        f.write("".join(entries))

# Scanner states used by parser().  The numeric order matters: the
# end-of-input flush compares states with ">=".
STATE_NORMAL = -1   # outside any escape sequence
STATE_SLASH = 0     # a backslash has just been consumed
STATE_BEGIN = 1     # "\u" consumed, no hex digit seen yet
STATE_UNICODE = 2   # "\u" consumed plus one to three hex digits

_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # interpreters without unichr (Python 3)


def parser(inpstr):
    r"""Decode literal ``\uXXXX`` escape sequences found in *inpstr*.

    A backslash followed by ``u`` and exactly four hexadecimal digits is
    replaced by the character with that code point.  Everything else is
    copied to the output unchanged, including other backslash sequences
    (such as ``\n``) and malformed or truncated ``\u`` escapes.

    Args:
        inpstr: Text that may contain ``\uXXXX`` escapes.

    Returns:
        The text with every well-formed escape decoded.
    """
    pieces = []
    state = STATE_NORMAL
    unicode_hex = ''

    for char in inpstr:
        if state == STATE_NORMAL:
            if char == '\\':
                state = STATE_SLASH
            else:
                pieces.append(char)
        elif state == STATE_SLASH:
            if char == 'u':
                state = STATE_BEGIN
                unicode_hex = ''
            elif char == '\\':
                # The earlier backslash was literal; this one may still
                # introduce an escape, so remain in STATE_SLASH.
                pieces.append('\\')
            else:
                # Not a \u escape: keep the backslash and the character.
                pieces.append('\\' + char)
                state = STATE_NORMAL
        elif char in _HEX_DIGITS:
            # STATE_BEGIN or STATE_UNICODE: collect one more hex digit.
            unicode_hex += char
            state = STATE_UNICODE
            if len(unicode_hex) == 4:
                pieces.append(_unichr(int(unicode_hex, 16)))
                state = STATE_NORMAL
        else:
            # Malformed escape: emit what was consumed so far verbatim.
            pieces.append('\\u' + unicode_hex)
            if char == '\\':
                state = STATE_SLASH
            else:
                pieces.append(char)
                state = STATE_NORMAL

    # Input ended inside an escape: flush the partial sequence as-is.
    if state >= STATE_SLASH:
        pieces.append('\\')
    if state >= STATE_BEGIN:
        pieces.append('u')
    if state == STATE_UNICODE:
        pieces.append(unicode_hex)
    return ''.join(pieces)