Python开发【笔记】:从海量文件的目

发布时间:2019-05-20 22:49:55 编辑:auto 阅读(1849)

    Python获取文件名的方法性能对比

     

    前言:平常在 Python 中,从文件夹中获取文件名的简单方法是 os.system('ll /data/');但是当文件夹中含有巨量文件时,这种方式完全行不通。

     

    在 /dd 目录中生成了近6百万个文件,接下来看看不同方法之间的性能对比。快速生成文件的 shell 脚本如下:

    # Append the line "text" to 1.txt ... 1000000.txt in the current directory,
    # creating each file if it does not exist yet.
    for n in $(seq 1 1000000); do
        echo text >> "${n}.txt"
    done
    

      

     

     1、系统命令 ls -l

    # System command: ls -l

    import time
    import subprocess

    start = time.time()
    # Run ls directly from an argument list (no intermediate shell) and use the
    # Popen context manager so the stdout pipe is closed and the child process
    # is waited for, even if the loop below is interrupted.
    with subprocess.Popen(['ls', '-l', '/dd/'], stdout=subprocess.PIPE) as result:
        # Drain the output line by line; the lines themselves are discarded.
        for line in result.stdout:
            pass
    print(time.time() - start)

    # Reported result: hangs outright (never finished)
    

      

    2、glob 模块

    # glob module

    import glob
    import time


    began = time.time()
    # glob.glob builds the complete list of matches before the loop starts.
    matches = glob.glob("/dd/*")
    for _ in matches:
        pass
    elapsed = time.time() - began
    print(elapsed)

    # Reported result: 49.60481119155884
    

      

    3、os.walk 函数

    # os.walk function

    import os
    import time

    t0 = time.time()
    # Bottom-up traversal of /dd/; every yielded
    # (dirpath, dirnames, filenames) triple is discarded.
    walker = os.walk("/dd/", topdown=False)
    for _triple in walker:
        pass
    print(time.time() - t0)

    # Reported result: 8.906772375106812
    

      

    4、os.scandir 函数

    # os.scandir function

    import os
    import time

    start = time.time()
    # The with-statement closes the directory iterator deterministically,
    # including when iteration stops early (for example on Ctrl-C).
    with os.scandir("/dd/") as entries:
        # Each item is an os.DirEntry; it is discarded, only timing matters.
        for entry in entries:
            pass
    print(time.time() - start)

    # Reported result: 4.118424415588379
    

      

    5、shell find命令

    # Shell find command

    import time
    import subprocess

    start = time.time()
    # Run find directly from an argument list (no intermediate shell) and use
    # the Popen context manager so the stdout pipe is closed and the child
    # process is waited for, even if the loop below is interrupted.
    with subprocess.Popen(['find', '/dd/'], stdout=subprocess.PIPE) as result:
        # Drain the output line by line; the lines themselves are discarded.
        for line in result.stdout:
            pass
    print(time.time() - start)

    # Reported result: 6.205533027648926
    

      

    6、shell ls -1 -f 命令(不进行排序)

    # Shell command: ls -1 -f

    import time
    import subprocess

    start = time.time()
    # Run ls directly from an argument list (no intermediate shell) and use the
    # Popen context manager so the stdout pipe is closed and the child process
    # is waited for, even if the loop below is interrupted.
    with subprocess.Popen(['ls', '-1', '-f', '/dd/'], stdout=subprocess.PIPE) as result:
        # Drain the output line by line; the lines themselves are discarded.
        for line in result.stdout:
            pass
    print(time.time() - start)

    # Reported result: 3.3476643562316895
    

      

    7、os.listdir

    # os.listdir

    import os
    import time


    t_begin = time.time()
    # os.listdir returns the whole list of entry names before the loop starts.
    names = os.listdir('/dd')
    for _name in names:
        pass
    print(time.time() - t_begin)

    # Reported result: 2.6720399856567383
    

      

     

关键字