python多线程模块--threading三部曲（一）

最新推荐文章于 2024-08-27 16:38:10 发布

leah_provence

最新推荐文章于 2024-08-27 16:38:10 发布

阅读量371

点赞数

分类专栏： python之多线程文章标签：多线程 python linux

本文链接：https://blog.csdn.net/leah_provence/article/details/71516587

版权

python之多线程专栏收录该内容

1 篇文章 0 订阅

订阅专栏

                            Threading（一）

当时你还小，手动把一个大文件分成了两个。添了几行代码，简陋的多线程。程序用时少了一半吧，你还笑了。<纪念第一次threading-傻>

注：linux分割文件：
split -l 300(分割行数) large_file new_file （指定分割后行数）
split -b 10m（文件大小） large_file new_file （指定分割后大小）

import threading
import urllib2
import sys
import csv
import re
import datetime

def read_csv(csv_file):
    """Read *csv_file* and return the list of article URLs built from it.

    Each CSV row is expected to hold, in its first column, a single string
    whose fields are joined by the separator ``'#=:'``. Rows with at least
    ten such fields produce one URL; blank rows and shorter rows are skipped.

    :param csv_file: path of the CSV file to read.
    :return: list of URL strings, in file order.
    """
    def _clean(line):
        # NUL bytes make csv.reader fail ("line contains NULL byte"),
        # so strip them before the line reaches the parser.
        line = line.replace(b'\0', b'')
        # On Python 2 a binary-mode line is already ``str``. On Python 3 it
        # is ``bytes`` and csv.reader requires text, so decode it there.
        if not isinstance(line, str):
            line = line.decode('utf-8', 'replace')
        return line

    l_original = []
    with open(csv_file, 'rb') as f:
        for item in csv.reader(_clean(line) for line in f):
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so skip such rows.
            if not item:
                continue
            iitem = item[0].split('#=:')
            if len(iitem) >= 10:
                # Assemble the URL (part of the real URL is deliberately
                # hidden in the original article).
                url = ('http://mp.weixin.??. =' + iitem[7]
                       + '==&mid=' + iitem[8]
                       + '&idx=' + iitem[-1]
                       + '&sn=' + iitem[9])
                l_original.append(url)
    return l_original

def get_html(csv_file, timeout=30):
    """Fetch every URL derived from *csv_file* and extract profile fields.

    The URLs come from ``read_csv(csv_file)``. For each URL one record is
    produced:

    * on success: ``[biz, nickname, meta_value]``, where a field whose
      pattern is not found in the page is the string ``'none'``;
    * on a failed request: ``[biz, url]`` so the URL can be retried later.

    ``biz`` is the text between the first and second ``'='`` of the URL.
    A running counter is printed before each request as a progress indicator.

    :param csv_file: path of the CSV file handed to ``read_csv``.
    :param timeout: per-request socket timeout in seconds; without one a
        stalled connection would block the calling thread indefinitely.
    :return: list of records (lists of strings), one per URL.
    """
    # Compile the patterns once rather than on every iteration.
    name_re = re.compile(
        r'<strong class="profile_nickname">(.*?)</strong>', re.S | re.M)
    value_re = re.compile(
        r'<span class="profile_meta_value">(.*?)</span>', re.S | re.M)

    l_out = []
    for n, url_wx in enumerate(read_csv(csv_file), 1):
        print(n)
        biz = url_wx.split('=')[1]
        try:
            response = urllib2.urlopen(url_wx, timeout=timeout)
            try:
                html = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except Exception:
            # Best effort: keep the failing URL in the output. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit through.
            l_out.append([biz, url_wx])
            continue

        names = name_re.findall(html)
        values = value_re.findall(html)
        l_out.append([
            biz,
            names[0] if names else 'none',
            values[0] if values else 'none',
        ])
    return l_out
def write_csv(csv_file):
    """Scrape the URLs listed in *csv_file* and save them to a CSV file.

    The records returned by ``get_html(csv_file)`` are written, one row per
    record, to ``kol_result_one.csv`` in the current working directory. An
    existing file of that name is overwritten.

    :param csv_file: path of the input CSV file passed to ``get_html``.
    """
    l_out = get_html(csv_file)
    # The ``with`` block flushes and closes the file even if a write fails.
    with open('kol_result_one.csv', 'w+') as file_result:
        csv.writer(file_result).writerows(l_out)
def write_csv_2(csv_file):
    """Scrape the URLs listed in *csv_file* and save them to a CSV file.

    Same as ``write_csv`` except for the destination: the records returned
    by ``get_html(csv_file)`` are written to ``kol_result_two.csv`` in the
    current working directory. An existing file of that name is overwritten.

    :param csv_file: path of the input CSV file passed to ``get_html``.
    """
    l_out = get_html(csv_file)
    # The ``with`` block flushes and closes the file even if a write fails.
    with open('kol_result_two.csv', 'w+') as file_result:
        csv.writer(file_result).writerows(l_out)
# Prepare (without starting) one worker thread per input file named on the
# command line: argv[1] is processed by write_csv, argv[2] by write_csv_2.
t1 = threading.Thread(target=write_csv, args=(sys.argv[1],))
t2 = threading.Thread(target=write_csv_2, args=(sys.argv[2],))
threads = [t1, t2]

def main():
    """Run all worker threads to completion and print the elapsed seconds.

    Starts every thread in the module-level ``threads`` list, waits for all
    of them to finish, then prints the whole number of seconds that passed.
    """
    starttime = datetime.datetime.now()
    for t in threads:
        t.start()
    # Without joining, the end timestamp would be taken right after the
    # threads are launched and the printed time would not include their work.
    for t in threads:
        t.join()
    endtime = datetime.datetime.now()
    # total_seconds() covers the whole interval; ``.seconds`` alone wraps
    # around after one day.
    print(int((endtime - starttime).total_seconds()))


if __name__ == '__main__':
    main()

进化后的代码，随后跟进.

leah_provence

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python多线程模块--threading三部曲（一）

import threadingimport urllib2import sysimport csvimport reimport datetimedef read_csv(csv_file):#读取csv文件 with open(csv_file,'rb') as f: reader_csv = csv.reader(line.replace('\0',''
复制链接

扫一扫

专栏目录