python显示班级性别比例_python实现爬虫统计学校BBS男女比例之数据处理（三）

最新推荐文章于 2024-07-02 13:37:29 发布

weixin_39995280

最新推荐文章于 2024-07-02 13:37:29 发布

阅读量1.8k

点赞数 2

本文链接：https://blog.csdn.net/weixin_39995280/article/details/111433702

版权

python显示班级性别比例

__author__ = 'admin'

# encoding: UTF-8

#多线程处理程序

import io
import os
import sys
import threading
import time

# Global tallies shared by all worker threads: every record (SUM), male (BOY),
# female (GIRL), undisclosed (SECRET) and records from files that carry no
# gender field (UNKOWN).
SUM = BOY = GIRL = SECRET = UNKOWN = 0

class StaFileList(threading.Thread):
    """Worker thread that tallies gender statistics for one batch of files.

    Files are dispatched on their name prefix:

    * ``correct*`` / ``errTime*`` -- one record per line; the second
      whitespace-separated field is compared against the gender markers.
    * ``unkownsex*`` -- no gender field; every line counts as one record.

    Files with any other prefix are ignored by ``staManyFiles``.

    Results are added to the module-level counters ``SUM``, ``BOY``,
    ``GIRL``, ``SECRET`` and ``UNKOWN``.  ``run`` additionally expects a
    module-level lock named ``mutex`` to exist before the thread starts.
    """

    # Directory the file names are resolved against (overridable per instance).
    dataDir = 'E:\\pythonProject\\ruisi'

    # Names of the files handled by this worker.  Kept as a class attribute
    # for backward compatibility; __init__ always rebinds it per instance and
    # this class never mutates the shared list.
    fileList = []

    # Gender markers as they appear in the data files.
    _MALE = u'\u7537'
    _FEMALE = u'\u5973'
    _UNDISCLOSED = u'\u4fdd\u5bc6'

    def __init__(self, fileList, dataDir=None):
        """Store the batch of file names.

        fileList -- names (not paths) of the files this thread processes.
        dataDir  -- optional directory containing those files; defaults to
                    the class-level ``dataDir``.
        """
        threading.Thread.__init__(self)
        self.fileList = fileList
        if dataDir is not None:
            self.dataDir = dataDir

    def run(self):
        """Count this worker's files, then publish the result under the lock."""
        # The file I/O happens outside the lock so workers can overlap; only
        # the short update of the shared counters is serialised.
        counts = self._countFiles(self.fileList)
        with mutex:
            self._addToTotals(counts)

    def staCorrectFiles(self, files):
        """Tally every file in *files* as a gender file, whatever its prefix.

        The global counters are updated without taking any lock.
        """
        counts = self._newCounts()
        for name in files:
            self._countGenderFile(name, counts)
        self._addToTotals(counts)

    def staManyFiles(self, files):
        """Tally *files*, choosing the parsing strategy from each name prefix.

        The global counters are updated without taking any lock; ``run`` is
        the entry point that synchronises the update.
        """
        self._addToTotals(self._countFiles(files))

    @staticmethod
    def _newCounts():
        """Return a fresh, zeroed set of per-batch counters."""
        return {'total': 0, 'boy': 0, 'girl': 0, 'secret': 0, 'unknown': 0}

    def _openText(self, name):
        """Open *name* inside ``dataDir`` as text with universal newlines."""
        # Decoding explicitly makes the comparison with the unicode gender
        # markers independent of the interpreter's default encoding.
        # NOTE(review): assumes the data files are UTF-8 -- confirm against
        # the crawler that writes them.
        return io.open(os.path.join(self.dataDir, name), 'r',
                       encoding='utf-8', errors='replace')

    def _countFiles(self, files):
        """Return the counters for *files* without touching global state."""
        counts = self._newCounts()
        for name in files:
            if name.startswith(('correct', 'errTime')):
                # "errTime" records lack an activity time but still carry
                # the gender field, so they are parsed like "correct" ones.
                self._countGenderFile(name, counts)
            elif name.startswith('unkownsex'):
                self._countUnknownFile(name, counts)
        return counts

    def _countGenderFile(self, name, counts):
        """Add the per-gender tallies of one record file to *counts*."""
        with self._openText(name) as lines:
            for line in lines:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or truncated line: there is no gender field.
                    continue
                counts['total'] += 1
                gender = fields[1]
                if gender == self._MALE:
                    counts['boy'] += 1
                elif gender == self._FEMALE:
                    counts['girl'] += 1
                elif gender == self._UNDISCLOSED:
                    counts['secret'] += 1

    def _countUnknownFile(self, name, counts):
        """Add the line count of a file without gender data to *counts*."""
        with self._openText(name) as lines:
            # Iterating keeps memory flat for large files; an empty file
            # simply contributes zero.
            found = sum(1 for _ in lines)
        counts['unknown'] += found
        counts['total'] += found

    @staticmethod
    def _addToTotals(counts):
        """Fold one batch of counters into the module-level totals."""
        global SUM, BOY, GIRL, SECRET, UNKOWN
        SUM += counts['total']
        BOY += counts['boy']
        GIRL += counts['girl']
        SECRET += counts['secret']
        UNKOWN += counts['unknown']

def test():
    """Count every data file in the crawl directory using worker threads.

    File names starting with "correct", "errTime" or "unkownsex" are taken
    in the order os.listdir returns them and handed out in batches of 20,
    one StaFileList thread per batch (the last batch may be shorter).  All
    threads are started first and then joined, so the function returns only
    after every worker has finished.
    """
    wanted = ("correct", "errTime", "unkownsex")
    names = [entry for entry in os.listdir(r'E:\pythonProject\ruisi')
             if entry.startswith(wanted)]

    # One worker per slice of at most 20 file names.
    workers = [StaFileList(names[begin:begin + 20])
               for begin in range(0, len(names), 20)]

    for worker in workers:
        worker.start()

    # The main thread waits here until every worker has exited.
    for worker in workers:
        worker.join()

if __name__ == '__main__':
    # Python 2 only: make implicit str <-> unicode conversion use UTF-8.
    # The counting code compares fields read from the data files with
    # unicode gender literals, which relies on this setting.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    startTime = time.clock()

    # Must be a module-level name: StaFileList.run looks up the global `mutex`.
    mutex = threading.Lock()

    test()

    print("Multi Thread, total is %s; %s boys; %s girls; %s secret; %s unkownsex" % (SUM, BOY, GIRL, SECRET, UNKOWN))

    # Report the elapsed time once (the listing repeated this step twice).
    endTime = time.clock()
    print("cost time " + str(endTime - startTime) + " s")

输出为

Multi Thread, total is 61111; 13937 boys; 4009 girls; 28942 secret;

cost time 1.23049112201 s

可以看出多线程还是优于单线程的；由于使用了同步，数据统计结果是一致的。

注意：在 Python 的类内部，访问实例属性或调用实例方法时必须显式加上 self，这点和 Java 区别很大。

def __init__(self, fileList):
    # Initialise the Thread base class before storing per-instance state.
    threading.Thread.__init__(self)
    self.fileList = fileList

def run(self):
    # Hold the shared lock while the global counters are updated; the with
    # statement releases it even if the counting raises.
    with mutex:
        # Calling another method of the same class requires the "self." prefix.
        # The listing above names this method staManyFiles.
        self.staManyFiles(self.fileList)

total is 61111; 13937 boys; 4009 girls; 28942 secret; 14223 unkownsex;

cost time 1.25413238673 s

以上就是本文的全部内容，希望对大家的学习有所帮助。

本文原创发布php中文网，转载请注明出处，感谢您的尊重！