
#!/usr/bin/python

# -*- coding: utf-8 -*-

"""

[Filename]

crifanLib.py

[Function]

crifan's common functions, implemented by Python.

[Note]

1. install chardet and BeautifulSoup before using this crifanLib.

[TODO]

1. use htmlentitydefs instead of the manually made html entity table

[History]

[v2.3]

1. add removeSoupContentsTagAttr, findFirstNavigableString, soupContentsToUnicode

[v2.0]

1. add tupleListToDict

[v1.9]

1.add randDigitsStr

[v1.8]

1.bugfix-> isFileValid support unquoted & lower for compare filename

[v1.7]

1.bugfix-> isFileValid support quoted & lower for compare filename

[v1.6]

1.add getCurTimestamp

[v1.5]

1.add timeout for all urllib2.urlopen to try to avoid dead url link

[v1.4]

1.add support overwrite header for getUrlResponse

2.add gzip support for getUrlResponse and getUrlRespHtml

"""

__author__ = "Crifan Li (admin@crifan.com)"

#__version__ = ""

__copyright__ = "Copyright (c) 2012, Crifan Li"

__license__ = "GPL"

import os;

import re;

import sys;

import time;

import chardet;

import urllib;

import urllib2;

from datetime import datetime,timedelta;

import bs4
from bs4 import BeautifulSoup
from bs4.element import Tag, NavigableString, CData
#from BeautifulSoup import BeautifulSoup,Tag,CData;

import logging;

#import htmlentitydefs;

import struct;

import zlib;

import random;

# from PIL import Image;

# from operator import itemgetter;

#--------------------------------const values-----------------------------------

__VERSION__ = "v2.3";

gConst = {

'userAgentIE9' : 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',

# also belong to ContentTypes, more info can refer: http://kenya.bokee.com/3200033.html

# here use Tuple to avoid unexpected change

# note: for tuple, refer item use tuple[i], not tuple(i)

'picSufList' : ('bmp', 'gif', 'jpeg', 'jpg', 'jpe', 'png', 'tiff', 'tif'),

'defaultTimeout': 20, # default timeout seconds for urllib2.urlopen

}

#----------------------------------global values--------------------------------

gVal = {

'calTimeKeyDict' : {},

'picSufChars' : '', # store the pic suffix char list

'currentLevel' : 0,

}

#### some internal functions ###

#------------------------------------------------------------------------------

# generate the suffix char list according to the constant picSufList

def genSufList() :

global gConst;

sufChrList = [];

for suffix in gConst['picSufList'] :

for c in suffix :

sufChrList.append(c);

sufChrList = uniqueList(sufChrList);

sufChrList.sort();

joinedSuf = ''.join(sufChrList);

swapedSuf = [];

swapedSuf = joinedSuf.swapcase();

wholeSuf = joinedSuf + swapedSuf;

return wholeSuf;

################################################################################

# Time

################################################################################

#------------------------------------------------------------------------------

# get current time's timestamp

def getCurTimestamp() :

return datetimeToTimestamp(datetime.now());

#------------------------------------------------------------------------------

# convert datetime value to timestamp

# from "2006-06-01 00:00:00" to 1149091200

def datetimeToTimestamp(datetimeVal) :

return int(time.mktime(datetimeVal.timetuple()));

#------------------------------------------------------------------------------

# convert timestamp to datetime value

# from 1149091200 to "2006-06-01 00:00:00"

def timestampToDatetime(timestamp) :

#print "type(timestamp)=",type(timestamp);

#print "timestamp=",timestamp;

#timestamp = int(timestamp);

timestamp = float(timestamp);

return datetime.fromtimestamp(timestamp);

#------------------------------------------------------------------------------

#init for calculate elapsed time

def calcTimeStart(uniqueKey) :

global gVal

gVal['calTimeKeyDict'][uniqueKey] = time.time();

return

#------------------------------------------------------------------------------

# get the elapsed time; before calling this, calcTimeStart must have been called to init

def calcTimeEnd(uniqueKey) :

global gVal

return time.time() - gVal['calTimeKeyDict'][uniqueKey];
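#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for calcTimeStart/calcTimeEnd: any hashable value can serve as the
# unique key; the key name "demo" below is just an illustration
def demoCalcTime():
    calcTimeStart("demo");
    time.sleep(0.5); # simulate some work here
    print "elapsed seconds=", calcTimeEnd("demo");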

#------------------------------------------------------------------------------

# convert local GMT8 to GMT time

# note: input should be 'datetime' type, not 'time' type

def convertLocalToGmt(localTime) :

return localTime - timedelta(hours=8);

################################################################################

# String

################################################################################

#------------------------------------------------------------------------------

# generate a random digits number string

# max digit number is 12

def randDigitsStr(digitNum = 12) :

if(digitNum > 12):

digitNum = 12;

randVal = random.random();

#print "randVal=",randVal; #randVal= 0.134248340235

randVal = str(randVal);

#print "randVal=",randVal; #randVal= 0.134248340235

randVal = randVal.replace("0.", "");

#print "randVal=",randVal; #randVal= 0.134248340235

# if last is 0, append that 0

if(len(randVal)==11):

randVal = randVal + "0";

#print "randVal=",randVal; #randVal= 0.134248340235

#randVal = randVal.replace("e+11", "");

#randVal = randVal.replace(".", "");

#print "randVal=",randVal; #randVal= 0.134248340235

randVal = randVal[0 : digitNum];

#print "randVal=",randVal; #randVal= 0.134248340235

return randVal;

#------------------------------------------------------------------------------

# get supported picture suffix list

def getPicSufList():

return gConst['picSufList'];

#------------------------------------------------------------------------------

# get supported picture suffix chars

def getPicSufChars():

return gVal['picSufChars'];

#------------------------------------------------------------------------------

# get the python script's own file name

# extract out xxx from:

# D:\yyy\zzz\xxx.py

# xxx.py

def extractFilename(inputStr) :

argv0List = inputStr.split("\\");

scriptName = argv0List[len(argv0List) - 1]; # get script file name self

possibleSuf = scriptName[-3:];

if possibleSuf == ".py" :

scriptName = scriptName[0:-3]; # remove ".py"

return scriptName;

#------------------------------------------------------------------------------

# replace the &#N; (N is a decimal number) numeric entity with the corresponding unicode char
# eg: replace "&#39;" with "'" in "Creepin&#39; up on you"
def repUniNumEntToChar(text):
unicodeP = re.compile('&#[0-9]+;');
def transToUniChr(match): # translate the matched string to unicode char
numStr = match.group(0)[2:-1]; # remove '&#' and ';'

num = int(numStr);

unicodeChar = unichr(num);

return unicodeChar;

return unicodeP.sub(transToUniChr, text);
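#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for repUniNumEntToChar: the numeric entity &#39; is decoded to the single quote char
def demoRepUniNumEntToChar():
    decodedStr = repUniNumEntToChar("Creepin&#39; up on you");
    print "decodedStr=", decodedStr; # Creepin' up on you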

#------------------------------------------------------------------------------

# generate the full url, which includes the main url plus the parameter list
# Note:
# normally just using urllib.urlencode is OK.
# only use this if you do NOT want urllib.urlencode to convert some special chars($,:,{,},...) into %XX

def genFullUrl(mainUrl, paraDict) :

fullUrl = mainUrl;

fullUrl += '?';

for i, para in enumerate(paraDict.keys()) :

if(i == 0):

# first para no '&'

fullUrl += str(para) + '=' + str(paraDict[para]);

else :

fullUrl += '&' + str(para) + '=' + str(paraDict[para]);

return fullUrl;
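#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for genFullUrl: the parameter names/values are made up; note that dict
# key order is not guaranteed, so the generated parameter order may vary
def demoGenFullUrl():
    fullUrl = genFullUrl("http://example.com/query", {'id': 123, 'range': '{1:10}'});
    print "fullUrl=", fullUrl; # eg: http://example.com/query?id=123&range={1:10}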

#------------------------------------------------------------------------------

# check whether two urls are similar
# note: both input urls should be str type

def urlIsSimilar(url1, url2) :

isSim = False;

url1 = str(url1);

url2 = str(url2);

slashList1 = url1.split('/');

slashList2 = url2.split('/');

lenS1 = len(slashList1);

lenS2 = len(slashList2);

# all should have same structure

if lenS1 != lenS2 :

# not same sturcture -> must not similar

isSim = False;

else :

sufPos1 = url1.rfind('.');

sufPos2 = url2.rfind('.');

suf1 = url1[(sufPos1 + 1) : ];

suf2 = url2[(sufPos2 + 1) : ];

# at least, suffix should same

if (suf1 == suf2) :

lastSlashPos1 = url1.rfind('/');

lastSlashPos2 = url2.rfind('/');

exceptName1 = url1[:lastSlashPos1];

exceptName2 = url2[:lastSlashPos2];

# except name, all other part should same

if (exceptName1 == exceptName2) :

isSim = True;

else :

# except name, other part is not same -> not similar

isSim = False;

else :

# suffix not same -> must not similar

isSim = False;

return isSim;

#------------------------------------------------------------------------------

# find whether a similar url exists in urlList

# if found, return True, similarSrcUrl

# if not found, return False, ''

def findSimilarUrl(url, urlList) :

(isSimilar, similarSrcUrl) = (False, '');

for srcUrl in urlList :

if urlIsSimilar(url, srcUrl) :

isSimilar = True;

similarSrcUrl = srcUrl;

break;

return (isSimilar, similarSrcUrl);
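#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for urlIsSimilar/findSimilarUrl with made-up urls:
# same slash structure and same suffix, only the file name differs -> similar
def demoFindSimilarUrl():
    urlList = ["http://example.com/pic/aaa.jpg", "http://example.com/doc/bbb.html"];
    (isSimilar, similarSrcUrl) = findSimilarUrl("http://example.com/pic/ccc.jpg", urlList);
    print "isSimilar=", isSimilar, ", similarSrcUrl=", similarSrcUrl; # True, .../aaa.jpg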

#------------------------------------------------------------------------------

# remove non-word chars, i.e. only retain alphanumeric characters (letters+numbers) and underscore

# eg:

# from againinput4@yeah to againinput4yeah

# from green-waste to greenwaste

def removeNonWordChar(inputString) :

return re.sub(r"[^\w]", "", inputString); # non [a-zA-Z0-9_]

#------------------------------------------------------------------------------

# remove control characters from the input string,
# otherwise the wordpress importer will fail
# (if the content contains control chars, the wordpress importer fails to import the wxr)

# eg:

# 1. http://againinput4.blog.163.com/blog/static/172799491201110111145259/

# content contains some invalid ascii control chars

# 2. http://hi.baidu.com/notebookrelated/blog/item/8bd88e351d449789a71e12c2.html

# 165th comment contains invalid control char: ETX

# 3. http://green-waste.blog.163.com/blog/static/32677678200879111913911/

# title contains control char:DC1, BS, DLE, DLE, DLE, DC1

def removeCtlChr(inputString) :

validContent = '';

for c in inputString :

asciiVal = ord(c);

validChrList = [

9, # 9=\t=tab

10, # 10=\n=LF=Line Feed=换行

13, # 13=\r=CR=回车

];

# filter out others ASCII control character, and DEL=delete

isValidChr = True;

if (asciiVal == 0x7F) :

isValidChr = False;

elif ((asciiVal < 32) and (asciiVal not in validChrList)) :

isValidChr = False;

if(isValidChr) :

validContent += c;

return validContent;

#------------------------------------------------------------------------------

# remove ANSI control character: 0x80-0xFF

def removeAnsiCtrlChar(inputString):

validContent = '';

for c in inputString :

asciiVal = ord(c);

isValidChr = True;

if ((asciiVal >= 0x80) and (asciiVal <= 0xFF)) :

#if ((asciiVal >= 0xB0) and (asciiVal <= 0xFF)) : # test

isValidChr = False;

#print "asciiVal=0x%x"%asciiVal;

if(isValidChr) :

validContent += c;

return validContent;

#------------------------------------------------------------------------------

# convert the string entity to unicode number entity

# refer: http://www.htmlhelp.com/reference/html40/entities/latin1.html

# TODO: need later use this htmlentitydefs instead following

def replaceStrEntToNumEnt(text) :

strToNumEntDict = {
    # Latin-1 Entities
    "&nbsp;"   : "&#160;",
    "&iexcl;"  : "&#161;",
    "&cent;"   : "&#162;",
    "&pound;"  : "&#163;",
    "&curren;" : "&#164;",
    "&yen;"    : "&#165;",
    "&brvbar;" : "&#166;",
    "&sect;"   : "&#167;",
    "&uml;"    : "&#168;",
    "&copy;"   : "&#169;",
    "&ordf;"   : "&#170;",
    "&laquo;"  : "&#171;",
    "&not;"    : "&#172;",
    "&shy;"    : "&#173;",
    "&reg;"    : "&#174;",
    "&macr;"   : "&#175;",
    "&deg;"    : "&#176;",
    "&plusmn;" : "&#177;",
    "&sup2;"   : "&#178;",
    "&sup3;"   : "&#179;",
    "&acute;"  : "&#180;",
    "&micro;"  : "&#181;",
    "&para;"   : "&#182;",
    "&middot;" : "&#183;",
    "&cedil;"  : "&#184;",
    "&sup1;"   : "&#185;",
    "&ordm;"   : "&#186;",
    "&raquo;"  : "&#187;",
    "&frac14;" : "&#188;",
    "&frac12;" : "&#189;",
    "&frac34;" : "&#190;",
    "&iquest;" : "&#191;",
    "&Agrave;" : "&#192;",
    "&Aacute;" : "&#193;",
    "&Acirc;"  : "&#194;",
    "&Atilde;" : "&#195;",
    "&Auml;"   : "&#196;",
    "&Aring;"  : "&#197;",
    "&AElig;"  : "&#198;",
    "&Ccedil;" : "&#199;",
    "&Egrave;" : "&#200;",
    "&Eacute;" : "&#201;",
    "&Ecirc;"  : "&#202;",
    "&Euml;"   : "&#203;",
    "&Igrave;" : "&#204;",
    "&Iacute;" : "&#205;",
    "&Icirc;"  : "&#206;",
    "&Iuml;"   : "&#207;",
    "&ETH;"    : "&#208;",
    "&Ntilde;" : "&#209;",
    "&Ograve;" : "&#210;",
    "&Oacute;" : "&#211;",
    "&Ocirc;"  : "&#212;",
    "&Otilde;" : "&#213;",
    "&Ouml;"   : "&#214;",
    "&times;"  : "&#215;",
    "&Oslash;" : "&#216;",
    "&Ugrave;" : "&#217;",
    "&Uacute;" : "&#218;",
    "&Ucirc;"  : "&#219;",
    "&Uuml;"   : "&#220;",
    "&Yacute;" : "&#221;",
    "&THORN;"  : "&#222;",
    "&szlig;"  : "&#223;",
    "&agrave;" : "&#224;",
    "&aacute;" : "&#225;",
    "&acirc;"  : "&#226;",
    "&atilde;" : "&#227;",
    "&auml;"   : "&#228;",
    "&aring;"  : "&#229;",
    "&aelig;"  : "&#230;",
    "&ccedil;" : "&#231;",
    "&egrave;" : "&#232;",
    "&eacute;" : "&#233;",
    "&ecirc;"  : "&#234;",
    "&euml;"   : "&#235;",
    "&igrave;" : "&#236;",
    "&iacute;" : "&#237;",
    "&icirc;"  : "&#238;",
    "&iuml;"   : "&#239;",
    "&eth;"    : "&#240;",
    "&ntilde;" : "&#241;",
    "&ograve;" : "&#242;",
    "&oacute;" : "&#243;",
    "&ocirc;"  : "&#244;",
    "&otilde;" : "&#245;",
    "&ouml;"   : "&#246;",
    "&divide;" : "&#247;",
    "&oslash;" : "&#248;",
    "&ugrave;" : "&#249;",
    "&uacute;" : "&#250;",
    "&ucirc;"  : "&#251;",
    "&uuml;"   : "&#252;",
    "&yacute;" : "&#253;",
    "&thorn;"  : "&#254;",
    "&yuml;"   : "&#255;",

    # http://www.htmlhelp.com/reference/html40/entities/special.html
    # Special Entities
    "&quot;"   : "&#34;",
    "&amp;"    : "&#38;",
    "&lt;"     : "&#60;",
    "&gt;"     : "&#62;",
    "&OElig;"  : "&#338;",
    "&oelig;"  : "&#339;",
    "&Scaron;" : "&#352;",
    "&scaron;" : "&#353;",
    "&Yuml;"   : "&#376;",
    "&circ;"   : "&#710;",
    "&tilde;"  : "&#732;",
    "&ensp;"   : "&#8194;",
    "&emsp;"   : "&#8195;",
    "&thinsp;" : "&#8201;",
    "&zwnj;"   : "&#8204;",
    "&zwj;"    : "&#8205;",
    "&lrm;"    : "&#8206;",
    "&rlm;"    : "&#8207;",
    "&ndash;"  : "&#8211;",
    "&mdash;"  : "&#8212;",
    "&lsquo;"  : "&#8216;",
    "&rsquo;"  : "&#8217;",
    "&sbquo;"  : "&#8218;",
    "&ldquo;"  : "&#8220;",
    "&rdquo;"  : "&#8221;",
    "&bdquo;"  : "&#8222;",
    "&dagger;" : "&#8224;",
    "&Dagger;" : "&#8225;",
    "&permil;" : "&#8240;",
    "&lsaquo;" : "&#8249;",
    "&rsaquo;" : "&#8250;",
    "&euro;"   : "&#8364;",
}

replacedText = text;

for key in strToNumEntDict.keys() :

replacedText = re.compile(key).sub(strToNumEntDict[key], replacedText);

return replacedText;

#------------------------------------------------------------------------------

# convert the xxx=yyy into tuple('xxx', yyy), then return the tuple value

# [makesure the input string]
# (1) does not include whitespace
# (2) includes '='
# (3) does not end with ';'

# [possible input string]

# blogUserName="againinput4"

# publisherEmail=""

# synchMiniBlog=false

# publishTime=1322129849397

# publisherName=null

# publisherNickname="\u957F\u5927\u662F\u70E6\u607C"

def convertToTupleVal(equationStr) :

(key, value) = ('', None);

try :

# Note:

# here should not use split with '=', for maybe input string contains string like this:

# http://img.bimg.126.net/photo/hmZoNQaqzZALvVp0rE7faA==/0.jpg

# so use find('=') instead

firstEqualPos = equationStr.find("=");

key = equationStr[0:firstEqualPos];

valuePart = equationStr[(firstEqualPos + 1):];

# string type

valLen = len(valuePart);

if valLen >= 2 :

# maybe string

if valuePart[0] == '"' and valuePart[-1] == '"' :

# is string type

value = str(valuePart[1:-1]);

elif (valuePart.lower() == 'null'):

value = None;

elif (valuePart.lower() == 'false'):

value = False;

elif (valuePart.lower() == 'true') :

value = True;

else :

# must int value

value = int(valuePart);

else :

# len=1 -> must be value

value = int(valuePart);

#print "Convert %s to [%s]=%s"%(equationStr, key, value);

except :

(key, value) = ('', None);

print "Fail of convert the equal string %s to value"%(equationStr);

return (key, value);
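#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for convertToTupleVal, using the input forms listed above
def demoConvertToTupleVal():
    print convertToTupleVal('blogUserName="againinput4"'); # ('blogUserName', 'againinput4')
    print convertToTupleVal('synchMiniBlog=false');        # ('synchMiniBlog', False)
    print convertToTupleVal('publishTime=1322129849397');  # ('publishTime', 1322129849397)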

################################################################################

# List

################################################################################

#------------------------------------------------------------------------------

# remove the empty ones in list

def removeEmptyInList(list) :

newList = [];

for val in list :

if val :

newList.append(val);

return newList;

#------------------------------------------------------------------------------

# remove overlapped item in the list

def uniqueList(old_list):

newList = []

for x in old_list:

if x not in newList :

newList.append(x)

return newList

#------------------------------------------------------------------------------

# for listToFilter, remove the ones which are in listToCompare
# also return the ones which already exist in listToCompare

def filterList(listToFilter, listToCompare) :

filteredList = [];

existedList = [];

for singleOne in listToFilter : # remove processed

if (not(singleOne in listToCompare)) :

# omit the ones in listToCompare

filteredList.append(singleOne);

else :

# record the already exist ones

existedList.append(singleOne);

return (filteredList, existedList);

#------------------------------------------------------------------------------

# convert tuple list to dict value

# [(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')]

# { u'type':u'text/javascript', u'src':u'http://partner.googleadservices.com/gampad/google_service.js' }

def tupleListToDict(tupleList):

convertedDict = {};

for eachTuple in tupleList:

(key, value) = eachTuple;

convertedDict[key] = value;

return convertedDict;

################################################################################

# File

################################################################################

#------------------------------------------------------------------------------

# save binary data into file

def saveBinDataToFile(binaryData, fileToSave):

saveOK = False;

try:

savedBinFile = open(fileToSave, "wb"); # open a file, if not exist, create it

#print "savedBinFile=",savedBinFile;

savedBinFile.write(binaryData);

savedBinFile.close();

saveOK = True;

except :

saveOK = False;

return saveOK;

################################################################################

# Network: urllib/urllib2/http

################################################################################

#------------------------------------------------------------------------------

# check file validity:
# open the file url and check whether the returned info matches
# with exception support
# note: handles the case where the file url is redirected

# eg :

# http://publish.it168.com/2007/0627/images/500754.jpg ->

# http://img.publish.it168.com/2007/0627/images/500754.jpg

# other special one:

# sina pic url:

# http://s14.sinaimg.cn/middle/3d55a9b7g9522d474a84d&690

# http://s14.sinaimg.cn/orignal/3d55a9b7g9522d474a84d

# the real url is same with above url

def isFileValid(fileUrl) :

fileIsValid = False;

errReason = "Unknown error";

try :

#print "original fileUrl=",fileUrl;

origFileName = fileUrl.split('/')[-1];

#print "origFileName=",origFileName;

#old: https://ie2zeq.bay.livefilestore.com/y1mo7UWr-TrmqbBhkw52I0ii__WE6l2UtMRSTZHSky66-uDxnCdKPr3bdqVrpUcQHcoJLedlFXa43bvCp_O0zEGF3JdG_yZ4wRT-c2AQmJ_TNcWvVZIXfBDgGerouWyx19WpA4I0XQR1syRJXjDNpwAbQ/IMG_5214_thumb[1].jpg

#new: https://kxoqva.bay.livefilestore.com/y1mQlGjwNAYiHKoH5Aw6TMNhsCmX2YDR3vPKnP86snuqQEtnZgy3dHkwUvZ61Ah8zU3AGiS4whmm_ADrvxdufEAfMGo56KjLdhIbosn9F34olQ/IMG_5214_thumb%5b1%5d.jpg

unquotedOrigFilenname = urllib.unquote(origFileName);

#print "unquotedOrigFilenname=",unquotedOrigFilenname

lowUnquotedOrigFilename = unquotedOrigFilenname.lower();

#print "lowUnquotedOrigFilename=",lowUnquotedOrigFilename;

resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']); # note: Python 2.6 has added timeout support.

#print "resp=",resp;

realUrl = resp.geturl();

#print "realUrl=",realUrl;

newFilename = realUrl.split('/')[-1];

#print "newFilename=",newFilename;

#http://blog.sina.com.cn/s/blog_696e50390100ntxs.html

unquotedNewFilename = urllib.unquote(newFilename);

#print "unquotedNewFilename=",unquotedNewFilename;

unquotedLowNewFilename = unquotedNewFilename.lower();

#print "unquotedLowNewFilename=",unquotedLowNewFilename;

respInfo = resp.info();

#print "respInfo=",respInfo;

respCode = resp.getcode();

#print "respCode=",respCode;

# special:

# http://116.img.pp.sohu.com/images/blog/2007/5/24/17/24/11355bf42a9.jpg

# return no content-length

#contentLen = respInfo['Content-Length'];

# for redirect, if returned size>0 and filename is same, also should be considered valid

#if (origFileName == newFilename) and (contentLen > 0):

# for redirect, if returned response code is 200(OK) and filename is same, also should be considered valid

#if (origFileName == newFilename) and (respCode == 200):

if (lowUnquotedOrigFilename == unquotedLowNewFilename) and (respCode == 200):

fileIsValid = True;

else :

fileIsValid = False;

# eg: Content-Type= image/gif, ContentTypes : audio/mpeg

# more ContentTypes can refer: http://kenya.bokee.com/3200033.html

contentType = respInfo['Content-Type'];
contentLen = respInfo.get('Content-Length', "unknown"); # may be missing for some urls (see note above)
errReason = "file url returned info: type=%s, len=%s, realUrl=%s"%(contentType, contentLen, realUrl);

# note: HTTPError is a subclass of URLError, so catch it first
except urllib2.HTTPError,code :
fileIsValid = False;
errReason = code;
except urllib2.URLError,reason :
fileIsValid = False;
errReason = reason;

except :

fileIsValid = False;

errReason = "Unknown error";

# here errReason may not be a str type, so just convert it to str

errReason = str(errReason);

return (fileIsValid, errReason);
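#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for isFileValid: check whether a (possibly redirected) picture url is
# still reachable; the url below comes from the comments above and may be dead now
def demoIsFileValid():
    (fileIsValid, errReason) = isFileValid("http://publish.it168.com/2007/0627/images/500754.jpg");
    print "fileIsValid=", fileIsValid, ", errReason=", errReason;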

#------------------------------------------------------------------------------

# download from fileUrl then save to fileToSave

# with exception support

# note: the caller should make sure the fileUrl is a valid internet resource/file

def downloadFile(fileUrl, fileToSave, needReport = False) :

isDownOK = False;

downloadingFile = '';

#---------------------------------------------------------------------------

# note: totalFileSize -> may be -1 on older FTP servers which do not return a file size in response to a retrieval request

def reportHook(copiedBlocks, blockSize, totalFileSize) :

#global downloadingFile

if copiedBlocks == 0 : # 1st call : once on establishment of the network connection

print 'Begin to download %s, total size=%d'%(downloadingFile, totalFileSize);

else : # rest call : once after each block read thereafter

print 'Downloaded bytes: %d' % ( blockSize * copiedBlocks);

return;

#---------------------------------------------------------------------------

try :

if fileUrl :

downloadingFile = fileUrl;

if needReport :

urllib.urlretrieve(fileUrl, fileToSave, reportHook);

else :

urllib.urlretrieve(fileUrl, fileToSave);

isDownOK = True;

else :

print "Input download file url is NULL";

except urllib.ContentTooShortError, msg :

isDownOK = False;

except :

isDownOK = False;

return isDownOK;

#------------------------------------------------------------------------------

# manually download fileUrl then save to fileToSave

def manuallyDownloadFile(fileUrl, fileToSave) :

isDownOK = False;

downloadingFile = '';

try :

if fileUrl :

# 1. find real address

#print "fileUrl=",fileUrl;

resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']);

#print "resp=",resp;

realUrl = resp.geturl(); # not same with original file url if redirect

# if url is invalid, then add timeout can avoid dead

respHtml = getUrlRespHtml(realUrl, useGzip=False, timeout=gConst['defaultTimeout']);

isDownOK = saveBinDataToFile(respHtml, fileToSave);

else :

print "Input download file url is NULL";

except urllib.ContentTooShortError, msg :

isDownOK = False;

except :

isDownOK = False;

return isDownOK;

#------------------------------------------------------------------------------

# get response from url

# note: if you have already installed a cookiejar-aware opener,
# then it will automatically be used here while using urllib2.Request

def getUrlResponse(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :

# makesure url is string, not unicode, otherwise urllib2.urlopen will error

url = str(url);

if (postDict) :

postData = urllib.urlencode(postDict);

req = urllib2.Request(url, postData);

req.add_header('Content-Type', "application/x-www-form-urlencoded");

else :

req = urllib2.Request(url);

if(headerDict) :

#print "added header:",headerDict;

for key in headerDict.keys() :

req.add_header(key, headerDict[key]);

defHeaderDict = {

'User-Agent' : gConst['userAgentIE9'],

'Cache-Control' : 'no-cache',

'Accept' : '*/*',

'Connection' : 'Keep-Alive',

};

# add default headers firstly

for eachDefHd in defHeaderDict.keys() :

#print "add default header: %s=%s"%(eachDefHd,defHeaderDict[eachDefHd]);

req.add_header(eachDefHd, defHeaderDict[eachDefHd]);

if(useGzip) :

#print "use gzip for",url;

req.add_header('Accept-Encoding', 'gzip, deflate');

# add customized header later -> allow overwrite default header

if(headerDict) :

#print "added header:",headerDict;

for key in headerDict.keys() :

req.add_header(key, headerDict[key]);

if(timeout > 0) :

# set timeout value if necessary

resp = urllib2.urlopen(req, timeout=timeout);

else :

resp = urllib2.urlopen(req);

return resp;

#------------------------------------------------------------------------------

# get response html==body from url

#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :

def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=True) :

resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip);

respHtml = resp.read();

if(useGzip) :

#print "---before unzip, len(respHtml)=",len(respHtml);

respInfo = resp.info();

# Server: nginx/1.0.8

# Date: Sun, 08 Apr 2012 12:30:35 GMT

# Content-Type: text/html

# Transfer-Encoding: chunked

# Connection: close

# Vary: Accept-Encoding

# ...

# Content-Encoding: gzip

# sometime, the request use gzip,deflate, but actually returned is un-gzip html

# -> response info not include above "Content-Encoding: gzip"

# eg: http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html

# -> so here only decode when it is indeed is gziped data

if( ("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip")) :

respHtml = zlib.decompress(respHtml, 16+zlib.MAX_WBITS);

#print "+++ after unzip, len(respHtml)=",len(respHtml);

return respHtml;
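#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for getUrlRespHtml: plain GET, GET with an extra header, and a POST;
# the urls, header values and post fields below are illustrative only
def demoGetUrlRespHtml():
    respHtml = getUrlRespHtml("http://www.example.com/");
    print "len(respHtml)=", len(respHtml);
    respHtml = getUrlRespHtml("http://www.example.com/", headerDict={'Referer': 'http://www.example.com/'});
    respHtml = getUrlRespHtml("http://www.example.com/login", postDict={'user': 'someUser', 'pwd': 'somePwd'}, timeout=10);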

################################################################################

# Cookies

################################################################################

#------------------------------------------------------------------------------

# check all cookies in cookiesDict is exist in cookieJar or not

def checkAllCookiesExist(cookieNameList, cookieJar) :

cookiesDict = {};

for eachCookieName in cookieNameList :

cookiesDict[eachCookieName] = False;

allCookieFound = True;

for cookie in cookieJar :

if(cookie.name in cookiesDict) :

cookiesDict[cookie.name] = True;

for eachCookie in cookiesDict.keys() :

if(not cookiesDict[eachCookie]) :

allCookieFound = False;

break;

return allCookieFound;
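#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for checkAllCookiesExist: install a cookie-aware opener first (cookielib
# is in the Python 2 standard library), then verify the expected cookie names were
# set; the url and cookie names below are illustrative only
def demoCheckAllCookiesExist():
    import cookielib;
    cookieJar = cookielib.CookieJar();
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar));
    urllib2.install_opener(opener);
    getUrlRespHtml("http://www.example.com/");
    print "all cookies found=", checkAllCookiesExist(['sessionid', 'token'], cookieJar);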

################################################################################

# Image

################################################################################

# import Image,ImageEnhance,ImageFilter;

# def testCaptcha():

# #http://www.pythonclub.org/project/captcha/python-pil

# #image_name = "20120409_134346_captcha.jpg";

# #image_name = "20120409_134531_captcha.jpg";

# #image_name = "20120409_134625_captcha.jpg";

# #image_name = "20120409_134928_captcha.jpg";

# image_name = "20120409_135233_captcha.jpg";

# im = Image.open(image_name);

# print "open OK for=",image_name;

# filter = ImageFilter.MedianFilter();

# print "MedianFilter OK";

# im = im.filter(filter);

# print "filter OK";

# enhancer = ImageEnhance.Contrast(im);

# print "Contrast OK";

# im = enhancer.enhance(2);

# print "enhance OK";

# im = im.convert('1');

# print "convert OK";

# #im.show()

# #print "show OK";

# im.save(image_name + "_new.gif");

# print "save OK";


# #------------------------------------------------------------------------------

# # [uncompleted]

# # parse input picture file to captcha(verify code)

# def parseCaptchaFromPicFile(inputCaptFilename):

# #http://www.wausita.com/captcha/

# parsedCaptchaStr = "";

# # picFp = open(inputCaptFilename, "rb");

# # print "open pic file OK,picFp=",picFp;

# # picData = picFp.read();

# # print "read pic file OK";

# # picFp.close();

# # print "len(picData)=",len(picData);

# print "------------------capta test begin -----------------";

# captchaDir = "captcha";

# #inputCaptFilename = "returned_captcha.jpg";

# #inputCaptFilename = "captcha.gif";

# print "inputCaptFilename=",inputCaptFilename;

# inputCaptFilename = inputCaptFilename.split("/")[-1];

# captchaPicFile = captchaDir + "/" + inputCaptFilename;

# print "captchaPicFile=",captchaPicFile;

# im = Image.open(captchaPicFile);

# im = im.convert("P");

# im2 = Image.new("P", im.size, 255);

# temp = {};

# # 225 571

# # 219 253

# # 189 82

# # 132 64

# # 90 63

# # 224 63

# # 139 48

# # 182 47

# # 133 43

# # 96 39

# his = im.histogram();

# print im.histogram();

# values = {};

# for i in range(256):

# values[i] = his[i];

# mostCommonColor = sorted(values.items(), key=itemgetter(1), reverse=True)[:10];

# print type(mostCommonColor);

# print "-----most 0-9:-----";

# for key in mostCommonColor:

# #print type(key);

# print key;

# startIdx = 0;

# endIdx = 3;

# outputGifName = captchaPicFile + "_from-%d_to-%d.gif"%(startIdx, endIdx);

# #mostCommonColor = mostCommonColor[0:3]; # good result -> 0.8 similar

# #mostCommonColor = mostCommonColor[0:2]; # not bad result -> 0.7 similar

# mostCommonColor = mostCommonColor[startIdx:endIdx];

# print "-----most %d-%d:-----"%(startIdx, endIdx);

# for j,k in mostCommonColor:

# print j,k;

# mostCommonColorDict = dict(mostCommonColor);

# print mostCommonColorDict;

# for x in range(im.size[1]):

# for y in range(im.size[0]):

# pix = im.getpixel((y,x));

# temp[pix] = pix;

# #if pix == 220 or pix == 227: # these are the numbers to get

# if pix in mostCommonColorDict:

# #print pix;

# im2.putpixel((y,x),0);

# im2.save(outputGifName);

# print "------------------capta test done -----------------";

# return parsedCaptchaStr;

################################################################################

# Functions that depend on third party lib

################################################################################

#------------------------------------------------------------------------------

# depend on chardet

# check whether the strToDect is ASCII string

def strIsAscii(strToDect) :

isAscii = False;

encInfo = chardet.detect(strToDect);

if (encInfo['confidence'] > 0.9) and (encInfo['encoding'] == 'ascii') :

isAscii = True;

return isAscii;

#------------------------------------------------------------------------------

# get the possible (possibility > 0.5) charset of the input string

def getStrPossibleCharset(inputStr) :

possibleCharset = "ascii";

#possibleCharset = "UTF-8";

encInfo = chardet.detect(inputStr);

#print "encInfo=",encInfo;

if (encInfo['confidence'] > 0.5):

possibleCharset = encInfo['encoding'];

return possibleCharset;

#return encInfo['encoding'];

#------------------------------------------------------------------------------

# depend on BeautifulSoup

# translate strToTranslate from fromLanguage to toLanguage

# return the translated unicode string

# some frequently used language abbrv:

# Chinese Simplified: zh-CN

# Chinese Traditional: zh-TW

# English: en

# German: de

# Japanese: ja

# Korean: ko

# French: fr

# more can be found at:

# http://code.google.com/intl/ru/apis/language/translate/v2/using_rest.html#language-params

def translateString(strToTranslate, fromLanguage="zh-CN", toLanguage="en"):

transOK = False;

translatedStr = strToTranslate;

transErr = '';

try :

# following refer: http://python.u85.us/viewnews-335.html

postDict = {'hl':'zh-CN', 'ie':'UTF-8', 'text':strToTranslate, 'langpair':"%s|%s"%(fromLanguage, toLanguage)};

googleTranslateUrl = 'http://translate.google.cn/translate_t';

resp = getUrlRespHtml(googleTranslateUrl, postDict);

#logging.debug("---------------google translate resp html:\n%s", resp);

# note: HTTPError is a subclass of URLError, so catch it first
except urllib2.HTTPError,code :
transOK = False;
transErr = code;
except urllib2.URLError,reason :
transOK = False;
transErr = reason;

else :

soup = BeautifulSoup(resp);

resultBoxSpan = soup.find(id='result_box');

if resultBoxSpan and resultBoxSpan.span and resultBoxSpan.span.string :

transOK = True;

#translatedStr = resultBoxSpan.span.string.encode('utf-8');

googleRetTransStr = resultBoxSpan.span.string;

translatedStr = unicode(googleRetTransStr);

# just record some special one:

# from:

#【转载】[SEP4020 u-boot] start.s 注释

# to:

# The 【reserved] [the SEP4020 u-boot] start.s comment

else :

transOK = False;

transErr = "can not extract translated string from returned result";

transErr = str(transErr);

if transOK :

return (transOK, translatedStr);

else :

return (transOK, transErr);

#------------------------------------------------------------------------------

# translate the Chinese Simplified(Zh-cn) string to English(en)

def transZhcnToEn(strToTrans) :

translatedStr = strToTrans;

transOK = False;

transErr = '';

if strIsAscii(strToTrans) :

transOK = True;

translatedStr = strToTrans;

else :

(transOK, translatedStr) = translateString(strToTrans, "zh-CN", "en");

return (transOK, translatedStr);
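#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for translateString/transZhcnToEn; note this relies on the (old) page
# layout of translate.google.cn, which may have changed since this was written
def demoTranslate():
    (transOK, result) = transZhcnToEn("hello"); # pure ascii -> returned unchanged
    print "transOK=", transOK, ", result=", result;
    (transOK, result) = translateString("你好", "zh-CN", "en");
    print "transOK=", transOK, ", result=", result;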

################################################################################

# BeautifulSoup

################################################################################

#------------------------------------------------------------------------------

#remove specific tag[key]=value in soup contents (list of BeautifulSoup.Tag/BeautifulSoup.NavigableString)

# eg:

# (1)

# removeSoupContentsTagAttr(soupContents, "p", "class", "cc-lisence")

# to remove the <p class="cc-lisence">......</p> tag from a contents list like:
# [
# u'\n',
# <p class="cc-lisence">......</p>,
# u'\u5bf9......\u3002',
# <p>跑题了。......我争取。</p>,
# u'\n',
# ]
# (2)
#contents = removeSoupContentsTagAttr(contents, "div", "class", "addfav", True);
# to recursively remove the <div class="addfav">.....</div> tag from a contents list like:
# [u'\n',
# <div ...>
# ...
# <div class="addfav">.....</div>
# ...
# </div>,
# u'\n']

def removeSoupContentsTagAttr(soupContents, tagName, tagAttrKey, tagAttrVal="", recursive=False) :

global gVal;

#print "in removeSoupContentsClass";

#print "[",gVal['currentLevel'],"] input tagName=",tagName," tagAttrKey=",tagAttrKey," tagAttrVal=",tagAttrVal;

#logging.debug("[%d] input, %s[%s]=%s, soupContents:%s", gVal['currentLevel'],tagName,tagAttrKey,tagAttrVal, soupContents);

#logging.debug("[%d] input, %s[%s]=%s", gVal['currentLevel'],tagName, tagAttrKey, tagAttrVal);

filtedContents = [];

for singleContent in soupContents:

#logging.debug("current singleContent=%s",singleContent);

#logging.info("singleContent=%s", singleContent);

#print "type(singleContent)=",type(singleContent);

#print "singleContent.__class__=",singleContent.__class__;

#if(isinstance(singleContent, BeautifulSoup)):

#if(BeautifulSoup.Tag == singleContent.__class__):

#if(isinstance(singleContent, instance)):

#if(isinstance(singleContent, BeautifulSoup.Tag)):

if(isinstance(singleContent, Tag)):

#print "isinstance true";

#logging.debug("singleContent: name=%s, attrMap=%s, attrs=%s",singleContent.name, singleContent.attrMap, singleContent.attrs);

# if( (singleContent.name == tagName)

# and (singleContent.attrMap)

# and (tagAttrKey in singleContent.attrMap)

# and ( (tagAttrVal and (singleContent.attrMap[tagAttrKey]==tagAttrVal)) or (not tagAttrVal) ) ):

# print "++++++++found tag:",tagName,"[",tagAttrKey,"]=",tagAttrVal,"\n in:",singleContent;

# #print "dir(singleContent)=",dir(singleContent);

# logging.debug("found %s[%s]=%s in %s", tagName, tagAttrKey, tagAttrVal, singleContent.attrMap);

# above using attrMap, but attrMap has bug for:

#singleContent: name=script, attrMap=None, attrs=[(u'type', u'text/javascript'), (u'src', u'http://partner.googleadservices.com/gampad/google_service.js')]

# so use attrs here

#logging.debug("singleContent: name=%s, attrs=%s", singleContent.name, singleContent.attrs);

attrsDict = tupleListToDict(singleContent.attrs);

if( (singleContent.name == tagName)

and (singleContent.attrs)

and (tagAttrKey in attrsDict)

and ( (tagAttrVal and (attrsDict[tagAttrKey]==tagAttrVal)) or (not tagAttrVal) ) ):

#print "++++++++found tag:",tagName,"[",tagAttrKey,"]=",tagAttrVal,"\n in:",singleContent;

#print "dir(singleContent)=",dir(singleContent);

logging.debug("found %s[%s]=%s in %s", tagName, tagAttrKey, tagAttrVal, attrsDict);

else:

if(recursive):

#print "-----sub call";

gVal['currentLevel'] = gVal['currentLevel'] + 1;

#logging.debug("[%d] now will filter %s[%s=]%s, for singleContent.contents=%s", gVal['currentLevel'], tagName,tagAttrKey,tagAttrVal, singleContent.contents);

#logging.debug("[%d] now will filter %s[%s=]%s", gVal['currentLevel'], tagName,tagAttrKey,tagAttrVal);

filteredSingleContent = singleContent;

filteredSubContentList = removeSoupContentsTagAttr(filteredSingleContent.contents, tagName, tagAttrKey, tagAttrVal, recursive);

gVal['currentLevel'] = gVal['currentLevel'] -1;

filteredSingleContent.contents = filteredSubContentList;

#logging.debug("[%d] after filter, sub contents=%s", gVal['currentLevel'], filteredSingleContent);

#logging.debug("[%d] after filter contents", gVal['currentLevel']);

filtedContents.append(filteredSingleContent);

else:

#logging.debug("not recursive, append:%s", singleContent);

#logging.debug("not recursive, now append singleContent");

filtedContents.append(singleContent);

# name = singleContent.name;

# if(name == tagName):

# print "name is equal, name=",name;

# attrMap = singleContent.attrMap;

# print "attrMap=",attrMap;

# if attrMap:

# if tagAttrKey in attrMap:

# print "tagAttrKey=",tagAttrKey," in attrMap";

# if(tagAttrVal and (attrMap[tagAttrKey]==tagAttrVal)) or (not tagAttrVal):

# print "++++++++found tag:",tagName,"[",tagAttrKey,"]=",tagAttrVal,"\n in:",singleContent;

# #print "dir(singleContent)=",dir(singleContent);

# logging.debug("found tag, tagAttrVal=%s, %s[%s]=%s", tagAttrVal, tagName, tagAttrVal, attrMap[tagAttrKey]);

# else:

# print "key in attrMap, but value not equal";

# if(recursive):

# print "-----sub call 111";

# gVal['currentLevel'] = gVal['currentLevel'] + 1;

# singleContent = removeSoupContentsTagAttr(singleContent.contents, tagName, tagAttrKey, tagAttrVal, recursive);

# gVal['currentLevel'] = gVal['currentLevel'] -1;

# filtedContents.append(singleContent);

# else:

# print "key not in attrMap";

# if(recursive):

# print "-----sub call 222";

# gVal['currentLevel'] = gVal['currentLevel'] + 1;

# singleContent = removeSoupContentsTagAttr(singleContent.contents, tagName, tagAttrKey, tagAttrVal, recursive);

# gVal['currentLevel'] = gVal['currentLevel'] -1;

# filtedContents.append(singleContent);

# else:

# print "attrMap is None";

# if(recursive):

# print "-----sub call 333";

# gVal['currentLevel'] = gVal['currentLevel'] + 1;

# singleContent = removeSoupContentsTagAttr(singleContent.contents, tagName, tagAttrKey, tagAttrVal, recursive);

# gVal['currentLevel'] = gVal['currentLevel'] -1;

# filtedContents.append(singleContent);

# else:

# print "name not equal, name=",name," tagName=",tagName;

# if(recursive):

# print "-----sub call 444";

# gVal['currentLevel'] = gVal['currentLevel'] + 1;

# singleContent = removeSoupContentsTagAttr(singleContent.contents, tagName, tagAttrKey, tagAttrVal, recursive);

# gVal['currentLevel'] = gVal['currentLevel'] -1;

# filtedContents.append(singleContent);

else:

# is BeautifulSoup.NavigableString

#print "not BeautifulSoup instance";

filtedContents.append(singleContent);

#print "filterd contents=",filtedContents;

#logging.debug("[%d] before return, filtedContents=%s", gVal['currentLevel'], filtedContents);

return filtedContents;

#------------------------------------------------------------------------------

# convert soup contents into unicode string

def soupContentsToUnicode(soupContents) :

#method 1

mappedContents = map(CData, soupContents);

#print "mappedContents OK";

#print "type(mappedContents)=",type(mappedContents); #type(mappedContents)=

contentUni = ''.join(mappedContents);

#print "contentUni=",contentUni;

# #method 2

# originBlogContent = "";

# logging.debug("Total %d contents for original soup contents:", len(soupContents));

# for i, content in enumerate(soupContents):

# if(content):

# logging.debug("[%d]=%s", i, content);

# originBlogContent += unicode(content);

# else :

# logging.debug("[%d] is null", i);

# logging.debug("---method 1: map and join---\n%s", contentUni);

# logging.debug("---method 2: enumerate ---\n%s", originBlogContent);

# # -->> seem that two method got same blog content

#logging.debug("soup contents to unicode string OK");

return contentUni;

#------------------------------------------------------------------------------

# find the first BeautifulSoup.NavigableString from soup contents

def findFirstNavigableString(soupContents):

firstString = None;

for eachContent in soupContents:

# note: NavigableString must be imported (here from bs4.element)

if(isinstance(eachContent, NavigableString)):

firstString = eachContent;

break;

return firstString;
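#------------------------------------------------------------------------------
# [added usage sketch, not part of the original lib]
# example for findFirstNavigableString: pick out the first plain text node among
# a tag's mixed contents; the html snippet below is made up
def demoFindFirstNavigableString():
    soup = BeautifulSoup('<div>some text<p>child paragraph</p>more text</div>');
    divContents = soup.find("div").contents;
    print "first string=", findFirstNavigableString(divContents); # some text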

#------------------------------------------------------------------------------

if __name__=="crifanLib":

gVal['picSufChars'] = genSufList();

#print "gVal['picSufChars']=",gVal['picSufChars'];

#print "Imported: %s,\t%s"%( __name__, __VERSION__);
