Python文本去重

发布时间:2019-09-18 07:27:52编辑:auto阅读(2450)

    用法:命令行python unique.py -f file.txt
    输出:去除重复行后的output.txt(保留每行第一次出现的顺序)

    # -*- coding:utf-8 -*-
    # author: cclarence, 2016-04-06
    # open the input file and read out its lines
    from optparse import OptionParser
    import sys
    # Python 2-only idiom: reload(sys) is called first, then
    # sys.setdefaultencoding switches the interpreter-wide default string
    # encoding to UTF-8 for the rest of the script.
    # NOTE(review): neither call is available as written on Python 3 --
    # confirm the target interpreter before porting.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    def readfile(filename):
        """Read ``filename`` and return its lines without trailing newlines.

        Args:
            filename: Path of the text file to read.

        Returns:
            A list with one string per line of the file, in file order. The
            trailing ``"\n"`` is removed from every line, including the last
            one, so that a final line is compared on equal terms with the
            lines before it when duplicates are searched for.

        Raises:
            SystemExit: with status 1, after printing ``No such file``, when
                the file cannot be opened.
        """
        try:
            f = open(filename)
        except Exception:
            # Script-level boundary: any failure to open the input (missing
            # file, bad path, permissions) is reported and ends the run with
            # a non-zero status so callers can detect the failure.
            print("No such file")
            sys.exit(1)
        # ``with`` guarantees the handle is closed even if reading fails.
        with f:
            # rstrip("\n") only removes the line terminator; the previous
            # index loop skipped the last line and left its newline in place.
            return [line.rstrip("\n") for line in f]
    #deduplication
    def unique(arr):
        """Return the distinct items of ``arr`` in first-occurrence order.

        Args:
            arr: Iterable of hashable items (the script passes a list of
                lines).

        Returns:
            A new list holding each distinct item once, ordered by where it
            first appears in ``arr``. The input is not modified.
        """
        # One pass with a "seen" set is linear; building a set and sorting it
        # by ``arr.index`` rescans the list for every item (quadratic).
        seen = set()
        result = []
        for item in arr:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result
    def main():
        """Deduplicate the lines of the file given with ``-f`` into output.txt.

        Parses the command line, reads the input through ``readfile``, removes
        repeated lines with ``unique``, writes the remaining lines to
        ``output.txt`` in the current directory (separated by newlines), and
        prints a summary of the line counts.

        Exits through ``parser.error`` (usage message on stderr, status 2)
        when the ``-f`` option is not supplied.
        """
        parser = OptionParser()
        parser.add_option("-f", "--file", dest="filename",
                          help="read the lines to deduplicate from FILE",
                          metavar="FILE")
        options, _ = parser.parse_args()
        if options.filename is None:
            # Without this check the missing option surfaced later as a
            # misleading "No such file" message.
            parser.error("an input file is required: -f FILE")

        text = readfile(options.filename)
        text_dealed = unique(text)

        # Joining with "\n" terminates every line except the last, which is
        # exactly what the output format has always been; ``with`` closes the
        # file even if the write fails.
        with open("output.txt", "w") as f:
            f.write("\n".join(text_dealed))

        deduplication_num = len(text) - len(text_dealed)
        print("success")
        print("The num of data from the source file        :" + str(len(text)))
        print("The num of data from the preprocessed file: :" + str(len(text_dealed)))
        print("The num of data removed                     :" + str(deduplication_num))
    # Run the command-line entry point only when this file is executed as a
    # script, not when it is imported as a module.
    if __name__ == '__main__':
        main()

关键字