wordfreq.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. #!/usr/bin/env python3
  2. import argparse
  3. import jieba
  4. import jieba.posseg as posseg
  5. import sys
  6. from collections import Counter
  7. from wordcloud import WordCloud
  8. def readfile(fn):
  9. f = open(fn)
  10. s = ''
  11. for line in f:
  12. s = s + line
  13. f.close()
  14. return s
  15. def cnt(words, parts):
  16. if not parts is None:
  17. S = set()
  18. pt = parts.split(',')
  19. for x in pt:
  20. S.add(x)
  21. w = []
  22. for x in words:
  23. if parts is None or x.flag in S:
  24. w.append(x.word)
  25. return Counter(w)
  26. def main():
  27. parser = argparse.ArgumentParser()
  28. parser.add_argument("-i", "--input", help = "input file name")
  29. parser.add_argument("-p", "--parts",
  30. help = "the parts you want of speech, seperated by comma")
  31. parser.add_argument("-j", "--jobs", type = int,
  32. help = "the number of parallel jobs")
  33. parser.add_argument("-t", "--top", type=int,
  34. help = "the number of words you want to show on the cloud")
  35. parser.add_argument("-o", "--output", help = "output image file name")
  36. parser.add_argument("-f", "--fontpath",
  37. help = "the path to the TTF/OTF fonts to use")
  38. args = parser.parse_args()
  39. if not args.jobs is None:
  40. jieba.enable_parallel(args.jobs)
  41. if args.input is None:
  42. parser.print_help()
  43. return 1
  44. s = readfile(args.input)
  45. words = posseg.cut(s)
  46. c = cnt(words, args.parts)
  47. if not args.top is None:
  48. c = dict(c.most_common(args.top))
  49. print(c)
  50. wc = WordCloud(font_path = args.fontpath).generate_from_frequencies(c)
  51. wc.to_image().save(args.output)
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())