#!/usr/bin/env python3
#------------------------------------------------------------------------#
# "THE BEER-WARE LICENSE" (Revision 42):                                 #
# wrote this file. As long as you retain this                            #
# notice you can do whatever you want with this stuff. If we meet some   #
# day, and you think this stuff is worth it, you can buy me a beer in    #
# return.                                                                #
#                                                              Xi Ruoyao #
#------------------------------------------------------------------------#
"""Build a word cloud image (or a CSV frequency table) from a text file.

The input text is segmented with jieba's part-of-speech tagger, the words
are optionally filtered by part-of-speech flag, counted, and then either
dumped as ``word,count`` rows (when the output name ends in ``.csv``) or
rendered with the ``wordcloud`` package.
"""

import argparse
import csv
import sys
from collections import Counter

import jieba
import jieba.posseg as posseg
from wordcloud import WordCloud


def readfile(fn):
    """Return the whole content of the text file *fn* as a single string.

    The file is opened with the platform default encoding, as before.
    """
    # The context manager closes the handle even if reading fails.
    with open(fn) as f:
        return f.read()


def cnt(words, parts):
    """Count how often each word occurs, optionally filtered by flag.

    Args:
        words: iterable of objects exposing ``.word`` and ``.flag``
            (what ``posseg.cut`` yields in ``main``).
        parts: comma-separated list of part-of-speech flags to keep, or
            ``None`` to keep every word.

    Returns:
        A ``collections.Counter`` mapping word -> number of occurrences.
    """
    if parts is None:
        return Counter(x.word for x in words)
    # Tolerate "n, v" and stray commas: strip blanks, drop empty entries.
    wanted = {p.strip() for p in parts.split(',') if p.strip()}
    return Counter(x.word for x in words if x.flag in wanted)


def _str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``.

    ``type=bool`` cannot be used with argparse because ``bool("False")`` is
    ``True``; every non-empty string would enable the flag.
    """
    text = value.strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % value)


def main():
    """Parse the command line and produce the requested output.

    Returns:
        0 on success, 1 when a required option (``--input`` or
        ``--output``) is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input",
                        help="input file name")
    parser.add_argument("-p", "--parts",
                        help="the parts you want of speech, "
                             "seperated by comma")
    parser.add_argument("-j", "--jobs", type=int,
                        help="the number of parallel jobs")
    parser.add_argument("-t", "--top", type=int,
                        help="the number of words you want to show "
                             "on the cloud")
    parser.add_argument("-o", "--output",
                        help="output image file name")
    parser.add_argument("-f", "--fontpath",
                        help="the path to the TTF/OTF fonts to use")
    # Accepts a bare "-d" as well as the historical "-d True" form.
    parser.add_argument("-d", "--debug", type=_str2bool, nargs="?",
                        const=True, default=False,
                        help="if you want to dump debug information")
    parser.add_argument("-m", "--mask",
                        help="the path to the mask image")
    parser.add_argument("-c", "--contour", type=int, default=0,
                        help="the width of contour if you want it")
    parser.add_argument("-x", "--width", type=int, default=400,
                        help="the width of canvas")
    parser.add_argument("-y", "--height", type=int, default=200,
                        help="the height of canvas")
    parser.add_argument("-r", "--prefer-horizontal", type=float,
                        default=0.9,
                        help="the ratio of times to try horizontal fitting")
    args = parser.parse_args()

    # Validate before the expensive segmentation; a missing --output used
    # to crash with a TypeError only after all the work had been done.
    if args.input is None or args.output is None:
        parser.print_help()
        return 1

    if args.jobs is not None:
        jieba.enable_parallel(args.jobs)

    s = readfile(args.input)
    words = posseg.cut(s)
    c = cnt(words, args.parts)
    if args.top is not None:
        c = dict(c.most_common(args.top))

    if args.debug:
        print(c)

    # recognize .csv output and skip wordcloud
    if args.output.lower().endswith('.csv'):
        # csv.writer quotes words containing commas, quotes or newlines,
        # which the former plain string concatenation silently corrupted.
        with open(args.output, mode="w", newline="") as ofile:
            writer = csv.writer(ofile, lineterminator="\n")
            for word, count in c.items():
                writer.writerow([word, count])
        return 0

    mask = None
    if args.mask is not None:
        # Imported lazily: only needed when a mask image is requested.
        from PIL import Image
        import numpy as np
        # the mask need to be scaled
        img = Image.open(args.mask).resize((args.width, args.height))
        mask = np.array(img)

    wordcloud = WordCloud(font_path=args.fontpath,
                          mask=mask,
                          contour_width=args.contour,
                          contour_color="steelblue",
                          width=args.width,
                          height=args.height,
                          prefer_horizontal=args.prefer_horizontal)
    wc = wordcloud.generate_from_frequencies(c)
    wc.to_image().save(args.output)
    return 0


if __name__ == "__main__":
    sys.exit(main())