1. numpy.hstack:沿水平方向(按列)拼接多个 array
import numpy as np
# Each pair below is stacked horizontally and printed:
#   * 1-D inputs (tuple- or list-based) are concatenated end to end;
#   * 2-D inputs are joined column-wise, so the row count stays the same.
_examples = [
    ((1, 2, 3), (4, 5, 6)),
    ([[1], [2], [3]], [[4], [5], [6]]),
    ([1, 2, 3], [4, 5, 6]),
    ([[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]),
]
for _left, _right in _examples:
    a, b = np.array(_left), np.array(_right)
    ab2vec = np.hstack((a, b))
    print(ab2vec)
输出结果:
[1 2 3 4 5 6]
[[1 4]
 [2 5]
 [3 6]]
[1 2 3 4 5 6]
[[ 1  2  7  8]
 [ 3  4  9 10]
 [ 5  6 11 12]]
[Finished in 0.2s]
2.日期格式转换
from datetime import datetime, timedelta
# import datetime
# Start from a compact YYYYMMDD string and insert dashes between the parts.
curr_date = '20180101'
curr_date = "-".join((curr_date[:4], curr_date[4:6], curr_date[6:]))
print(curr_date, type(curr_date))

# Parse the dashed string and keep only the calendar date.
parsed = datetime.strptime(curr_date, "%Y-%m-%d")
curr_date = parsed.date()
print(curr_date)
输出:
2018-01-01 <class 'str'>
2018-01-01
[Finished in 0.9s]
3. array 相关(numpy.hstack)及字典 setdefault 用法
import numpy as np
# 1-D arrays built from tuples: hstack joins them end to end.
a, b = np.array((1, 2, 3)), np.array((4, 5, 6))
ab2vec = np.hstack([a, b])
print(ab2vec)

# Column vectors (3x1 each): hstack places them side by side as a 3x2 array.
a, b = np.array([[1], [2], [3]]), np.array([[4], [5], [6]])
ab2vec = np.hstack([a, b])
print(ab2vec)

# 1-D arrays built from lists behave exactly like the tuple-based ones.
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
ab2vec = np.hstack([a, b])
print(ab2vec)

# Two 3x2 matrices: the columns of b are appended after the columns of a.
a = np.array([[1, 2], [3, 4], [5, 6]])
b = np.array([[7, 8], [9, 10], [11, 12]])
ab2vec = np.hstack([a, b])
print(ab2vec)

# setdefault() stores the value only because the key is not present yet.
labels = dict()
labels.setdefault("love", 1)
print(labels.get("love"))
labels["love"] = labels["love"] + 1
print(labels.get("love"))
输出:
[1 2 3 4 5 6]
[[1 4]
 [2 5]
 [3 6]]
[1 2 3 4 5 6]
[[ 1  2  7  8]
 [ 3  4  9 10]
 [ 5  6 11 12]]
1
2
[Finished in 1.5s]
4.异常值处理
def is_abnormal(value):
    """Return True when *value* should be treated as an abnormal reading.

    A value is abnormal when it cannot be converted to ``float`` (for
    example the placeholder ``'-'`` or ``None``) or when the converted
    number is less than or equal to zero.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Not numeric at all, e.g. '-' or None.
        return True
    return number <= 0


data = '-'
# Exactly one verdict is printed per value: "True" for abnormal input,
# "False" for a valid positive number.
print(is_abnormal(data))
输出:
True
[Finished in 0.2s]
5.sql 数据库更新
import pymysql
# Connection settings are passed by keyword; PyMySQL 1.0+ rejects positional
# arguments to connect().
db = pymysql.connect(host="localhost", user="root", password="root", database="wumai")
try:
    cursor = db.cursor()
    table = 'a_new'
    print(table)
    new_date = '20140822'
    new_time = '11'
    row_id = 2
    # An identifier (the table name) cannot be sent as a bound parameter, so
    # it is formatted into the statement; it must never come from user input.
    # The column values are bound by the driver, which quotes/escapes them.
    sql = """UPDATE {} SET date = %s, time = %s
WHERE id = %s
""".format(table)
    params = (new_date, new_time, row_id)
    # mogrify() returns the exact statement that execute() will send.
    print(cursor.mogrify(sql, params))
    cursor.execute(sql, params)
    # An UPDATE produces no result set, so this is an empty sequence.
    results = cursor.fetchall()
    print(results)
    db.commit()
finally:
    # Release the connection even when the statement fails.
    db.close()
输出:
a_new
UPDATE a_new SET date = 20140822, time = 11
WHERE id = 2
()
[Finished in 1.9s]
6.返回对象大小(单位:字节)
# Size of the object `data` in bytes. The figure is shallow: objects that
# `data` merely refers to are not included. Needs `import sys` beforehand.
sys.getsizeof(data)
7.定时运行
import time
import sys, os.path
from sys import argv
from os import system, remove
from string import *
# from subr import *
def print_ts(message):
    """Print *message* prefixed with the current local time.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS`` and separated from
    the message by a single space.
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), message)
def run(interval, command):
    """Run the shell *command* repeatedly, once per *interval* seconds.

    Before each run the function sleeps until the next point in time at which
    ``time.time()`` is a multiple of *interval*. The loop never returns;
    stop it by interrupting the process.

    Args:
        interval: Positive number of seconds between runs.
        command: Command line handed to ``os.system``.

    Raises:
        ValueError: If *interval* is not greater than zero. Without this
            check every iteration would fail (modulo by zero or a negative
            sleep), the error would be swallowed below, and the loop would
            spin without ever sleeping.
    """
    if interval <= 0:
        raise ValueError("interval must be a positive number of seconds")

    separator = "-" * 100
    print_ts(separator)
    print_ts("Command %s" % command)
    print_ts("Starting every %s seconds." % interval)
    print_ts(separator)
    while True:
        try:
            # Sleep for whatever is left of the current interval.
            time_remaining = interval - time.time() % interval
            wake_up = time.ctime(time.time() + time_remaining)
            print_ts("Sleeping until %s (%s seconds)..." % (wake_up, time_remaining))
            time.sleep(time_remaining)
            print_ts("Starting command.")
            # Execute the command and report the status os.system() returned.
            status = os.system(command)
            print_ts(separator)
            print_ts("Command status = %s." % status)
        except Exception as e:
            # Best effort: report the problem and keep the schedule running.
            print(e)
if __name__ == "__main__":
    # Launch the script on every 5-second boundary until interrupted.
    run(interval=5, command="python print.py")
输出:
2018-09-01 15:31:50 ----------------------------------------------------------------------------------------------------
2018-09-01 15:31:50 Command python print.py
2018-09-01 15:31:50 Starting every 5 seconds.
2018-09-01 15:31:50 ----------------------------------------------------------------------------------------------------
2018-09-01 15:31:50 Sleeping until Sat Sep 1 15:31:55 2018 (4.137838363647461 seconds)...
2018-09-01 15:31:55 Starting command.
python: can't open file 'print.py': [Errno 2] No such file or directory
2018-09-01 15:31:55 ----------------------------------------------------------------------------------------------------
2018-09-01 15:31:55 Command status = 2.
2018-09-01 15:31:55 Sleeping until Sat Sep 1 15:32:00 2018 (4.814587116241455 seconds)...
2018-09-01 15:32:00 Starting command.
python: can't open file 'print.py': [Errno 2] No such file or directory
2018-09-01 15:32:00 ----------------------------------------------------------------------------------------------------
2018-09-01 15:32:00 Command status = 2.
2018-09-01 15:32:00 Sleeping until Sat Sep 1 15:32:05 2018 (4.816297769546509 seconds)...
2018-09-01 15:32:05 Starting command.
python: can't open file 'print.py': [Errno 2] No such file or directory
2018-09-01 15:32:05 ----------------------------------------------------------------------------------------------------
2018-09-01 15:32:05 Command status = 2.
2018-09-01 15:32:05 Sleeping until Sat Sep 1 15:32:10 2018 (4.799007892608643 seconds)...
2018-09-01 15:32:10 Starting command.
python: can't open file 'print.py': [Errno 2] No such file or directory
2018-09-01 15:32:10 ----------------------------------------------------------------------------------------------------
2018-09-01 15:32:10 Command status = 2.
2018-09-01 15:32:10 Sleeping until Sat Sep 1 15:32:15 2018 (4.847720384597778 seconds)...
[Cancelled]
8.批量处理html格式数据
#-*- coding:utf-8 -*-
import re
# Patterns used by filter_tags(); compiled once because the function is
# typically called for every line of a document.
_RE_CDATA = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)   # //<![CDATA[ ... //]]>
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
_RE_STYLE = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
_RE_COMMENT = re.compile(r'<!--.*?-->', re.S)                # complete comments
_RE_BR = re.compile(r'<br\s*/?>', re.I)                      # line breaks
_RE_TAG = re.compile(r'</?\w+[^>]*>')                        # ordinary tags
_RE_DECLARATION = re.compile(r'<![^>]*>')                    # <!DOCTYPE ...>, leftover <!-- ... >
_RE_BLANK_LINES = re.compile(r'\n+')


def filter_tags(htmlstr):
    """Strip HTML markup from *htmlstr* and return the remaining text.

    Processing order:
      1. CDATA markers, ``<script>`` and ``<style>`` elements are removed
         together with their content.
      2. Complete comments are removed before ordinary tags, so a ``>``
         inside a comment cannot leave a dangling ``-->`` behind.
      3. ``<br>`` (any letter case) becomes a newline; every other tag is
         dropped.
      4. Remaining ``<! ... >`` declarations or unterminated comments are
         dropped.
      5. Runs of newlines are collapsed to one and character entities are
         decoded with ``replaceCharEntity``.

    Args:
        htmlstr: HTML text.

    Returns:
        The text with tags removed and entities decoded.
    """
    s = _RE_CDATA.sub('', htmlstr)
    s = _RE_SCRIPT.sub('', s)
    s = _RE_STYLE.sub('', s)
    s = _RE_COMMENT.sub('', s)
    s = _RE_BR.sub('\n', s)
    s = _RE_TAG.sub('', s)
    s = _RE_DECLARATION.sub('', s)
    # Collapse the blank lines left behind by removed markup.
    s = _RE_BLANK_LINES.sub('\n', s)
    # Entities are decoded last so that text such as "&lt;b&gt;" is kept as
    # literal "<b>" instead of being stripped as a tag.
    return replaceCharEntity(s)
# Character references understood by replaceCharEntity(): the entity name or
# its decimal code point (text between "&"/"&#" and ";") mapped to the
# replacement character. Add entries here to decode more entities.
CHAR_ENTITIES = {
    'nbsp': ' ', '160': ' ',
    'lt': '<', '60': '<',
    'gt': '>', '62': '>',
    'amp': '&', '38': '&',
    'quot': '"', '34': '"',
}
_RE_CHAR_ENTITY = re.compile(r'&#?(?P<name>\w+);')


def replaceCharEntity(htmlstr):
    """Replace common HTML character entities with the characters they stand for.

    Entities listed in ``CHAR_ENTITIES`` are decoded; any other entity is
    replaced by the empty string. The text is decoded in a single pass, so
    the result of one replacement is never decoded a second time
    (``&amp;lt;`` becomes ``&lt;``, not ``<``).

    Args:
        htmlstr: Text that may contain ``&name;`` / ``&#NNN;`` entities.

    Returns:
        The text with entities replaced.
    """
    def _decode(match):
        # Unknown entities are dropped.
        return CHAR_ENTITIES.get(match.group('name'), '')

    return _RE_CHAR_ENTITY.sub(_decode, htmlstr)
def repalce(s, re_exp, repl_string):
    """Return *s* with every match of the compiled pattern *re_exp* replaced.

    Args:
        s: Text to process.
        re_exp: Compiled regular expression (an object with a ``sub`` method).
        repl_string: Replacement passed to ``re_exp.sub``.

    Returns:
        The text after substitution.
    """
    return re_exp.sub(repl_string, s)
def _strip_html_files(src_dir, out_prefix):
    """Write a tag-free text copy of every regular file found in *src_dir*.

    The n-th processed file (in sorted name order, counting from 0) is
    written to ``out_prefix + str(n) + ".txt"``. Each input line is passed
    through ``filter_tags``; results that consist of a lone newline are
    skipped.

    Args:
        src_dir: Directory holding the raw HTML files.
        out_prefix: Path prefix of the output files; its directory part is
            created when missing.
    """
    import os

    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    index = 0
    for name in sorted(os.listdir(src_dir)):
        src_path = os.path.join(src_dir, name)
        # Test the full path: a bare name would be resolved against the
        # current working directory, and sub-folders of src_dir (such as the
        # output folder) would then be opened as if they were files.
        if os.path.isdir(src_path):
            continue
        out_path = out_prefix + str(index) + ".txt"
        index += 1
        with open(src_path, 'r', encoding='UTF-8') as src, \
                open(out_path, 'w', encoding='UTF-8') as dst:
            for line in src:
                news = filter_tags(line)
                if news != '\n':
                    dst.write(news + '\n')


if __name__ == '__main__':
    path = "D:/LDA/dataset/Wiki10/Wiki10_RawData"  # folder with the raw files
    file_proc = 'D:/LDA/dataset/Wiki10/Wiki10_RawData/processed_data/data_'
    _strip_html_files(path, file_proc)
# iter_f = iter(f); #创建迭代器
# str = ""
# for line in iter_f: #遍历文件,一行行遍历,读取文本
# str = str + line
# s.append(str) #每个文件的文本存到list中
# print(s) #打印结果
# filename = 'D:/LDA/dataset/Wiki10/Wiki10_RawData/000e9edf0163688ef62a4592546109fb'
# f = open(filename, 'r', encoding='UTF-8')
# with open(file_proc, 'w') as file:
# for line in f:
# news = filter_tags(line)
# # print(news)
# # print(type(news))
# # lis = []
# # lis.append(news)
# # print(lis)
# if(news != '\n'):
# file.writelines(news+'\n')
# f.close()