#!/usr/bin/env python3
# xry111/wordfreq

#------------------------------------------------------------------------#
# "THE BEER-WARE LICENSE" (Revision 42):                                 #
# <xry111@mengyan1223.wang> wrote this file.  As long as you retain this #
# notice you can do whatever you want with this stuff. If we meet some   #
# day, and you think this stuff is worth it, you can buy me a beer in    #
# return.                                                                #
#                                                       Xi Ruoyao        #
#------------------------------------------------------------------------#

import argparse
import jieba
import jieba.posseg as posseg
import sys

from collections import Counter
from wordcloud import WordCloud

def readfile(fn):
    """Return the whole content of the text file *fn* as a single string.

    The file is opened in text mode with the platform's default encoding,
    exactly as a bare ``open(fn)`` does.

    Args:
        fn: path of the file to read.

    Returns:
        The complete file content; an empty string for an empty file.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when reading raises, and a
    # single read() avoids rebuilding the string once per line.
    with open(fn) as f:
        return f.read()

def cnt(words, parts):
    """Count word occurrences, optionally keeping only some parts of speech.

    Args:
        words: iterable of objects exposing ``word`` and ``flag`` attributes.
            It is iterated exactly once, so a generator is fine.
        parts: comma-separated part-of-speech flags to keep (e.g. ``"n,v"``),
            or None to count every word regardless of its flag.

    Returns:
        A ``Counter`` mapping each kept word to its number of occurrences.
    """
    if parts is None:
        return Counter(x.word for x in words)

    # Strip blanks around each flag so that "n, v" selects the same words
    # as "n,v" instead of silently looking for a flag named " v".
    wanted = {flag.strip() for flag in parts.split(',')}
    return Counter(x.word for x in words if x.flag in wanted)

def _parse_bool(text):
    """Convert a command-line word such as "true" or "0" into a bool.

    argparse's ``type=bool`` does not work for this: bool() of any non-empty
    string, including "False", is True.
    """
    lowered = text.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % text)

def main():
    """Command-line entry point: count words and write a CSV or an image.

    Reads the input file, segments it with ``posseg.cut``, counts the words
    (optionally filtered by part of speech and limited to the most common
    ones) and then either writes ``word,count`` rows when the output name
    ends in ``.csv`` or renders a word cloud image to the output path.

    Returns:
        1 when a required option (``--input`` or ``--output``) is missing,
        after printing the usage help; None otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input file name")
    parser.add_argument("-p", "--parts",
            help="the parts you want of speech, seperated by comma")
    parser.add_argument("-j", "--jobs", type=int,
            help="the number of parallel jobs")
    parser.add_argument("-t", "--top", type=int,
            help="the number of words you want to show on the cloud")
    parser.add_argument("-o", "--output", help="output image file name")
    parser.add_argument("-f", "--fontpath",
            help="the path to the TTF/OTF fonts to use")
    # nargs="?" keeps the old "-d True" spelling working, lets a bare "-d"
    # enable debugging, and makes "-d False" actually mean False.
    parser.add_argument("-d", "--debug", type=_parse_bool, nargs="?",
            const=True, default=False,
            help="if you want to dump debug information")
    parser.add_argument("-m", "--mask",
            help="the path to the mask image")
    parser.add_argument("-c", "--contour", type=int, default=0,
            help="the width of contour if you want it")
    parser.add_argument("-x", "--width", type=int, default=400,
            help="the width of canvas")
    parser.add_argument("-y", "--height", type=int, default=200,
            help="the height of canvas")
    parser.add_argument("-r", "--prefer-horizontal", type=float,
            default=0.9,
            help="the ratio of times to try horizontal fitting")
    args = parser.parse_args()

    # Validate before doing any work: without an output name the suffix
    # check below would fail with a TypeError on None.
    if args.input is None or args.output is None:
        parser.print_help()
        return 1

    if args.jobs is not None:
        jieba.enable_parallel(args.jobs)

    s = readfile(args.input)
    words = posseg.cut(s)
    c = cnt(words, args.parts)
    if args.top is not None:
        c = dict(c.most_common(args.top))

    if args.debug:
        print(c)

    # recognize .csv output and skip wordcloud
    if args.output.endswith('.csv'):
        import csv
        # csv.writer quotes tokens that contain a comma, a quote or a line
        # break, so such tokens cannot corrupt the row structure.
        with open(args.output, mode="w", newline="") as ofile:
            writer = csv.writer(ofile, lineterminator="\n")
            writer.writerows(c.items())
        return

    mask = None
    if args.mask is not None:
        from PIL import Image
        import numpy as np
        # the mask needs to be scaled to the requested canvas size
        img = Image.open(args.mask).resize((args.width, args.height))
        mask = np.array(img)

    wordcloud = WordCloud(font_path=args.fontpath,
            mask=mask,
            contour_width=args.contour,
            contour_color="steelblue",
            width=args.width,
            height=args.height,
            prefer_horizontal=args.prefer_horizontal)
    wc = wordcloud.generate_from_frequencies(c)
    wc.to_image().save(args.output)

if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())