123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- #!/usr/bin/env python3
- #------------------------------------------------------------------------#
- # "THE BEER-WARE LICENSE" (Revision 42): #
- # <xry111@mengyan1223.wang> wrote this file. As long as you retain this #
- # notice you can do whatever you want with this stuff. If we meet some #
- # day, and you think this stuff is worth it, you can buy me a beer in #
- # return. #
- # Xi Ruoyao #
- #------------------------------------------------------------------------#
- import argparse
- import jieba
- import jieba.posseg as posseg
- import sys
- from collections import Counter
- from wordcloud import WordCloud
def readfile(fn):
    """Return the entire contents of the text file *fn* as one string.

    The file is opened in text mode with the platform's default encoding,
    exactly as a bare ``open(fn)`` does.

    Args:
        fn: Path of the file to read.

    Returns:
        The file's text; an empty string for an empty file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when reading fails, and a
    # single read() avoids rebuilding the string once per line.
    with open(fn) as f:
        return f.read()
def cnt(words, parts):
    """Count how often each word occurs, optionally filtered by flag.

    Args:
        words: Iterable of objects exposing ``word`` and ``flag``
            attributes.
        parts: Comma-separated string of flags to keep, or ``None`` to
            keep every word.

    Returns:
        A ``Counter`` mapping each kept word to its number of occurrences.
    """
    # No filter requested: every token counts and ``flag`` is never read.
    if parts is None:
        return Counter(token.word for token in words)

    wanted = set(parts.split(','))
    return Counter(token.word for token in words if token.flag in wanted)
def _parse_bool(text):
    """Turn a command-line string into a bool; only false-like words are False."""
    return text.strip().lower() not in ("", "0", "false", "no", "off", "n", "f")


def _write_csv(freq, path):
    """Write the word/count pairs of *freq* to *path* as ``word,count`` rows."""
    import csv

    # csv.writer quotes words that contain commas, quotes or newlines, which
    # plain string concatenation would turn into malformed rows.
    with open(path, mode="w", newline="") as ofile:
        csv.writer(ofile, lineterminator="\n").writerows(freq.items())


def _load_mask(path, width, height):
    """Load the image at *path*, scale it to the canvas size, return an array."""
    from PIL import Image
    import numpy as np

    # The mask needs to be scaled to the canvas; the with-block closes the
    # underlying image file once the resized copy has been made.
    with Image.open(path) as img:
        return np.array(img.resize((width, height)))


def main(argv=None):
    """Build a word cloud image (or a CSV frequency table) from a text file.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        1 after printing the help text when ``--input`` or ``--output`` is
        missing, 0 on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help = "input file name")
    parser.add_argument("-p", "--parts",
                        help = "the parts you want of speech, seperated by comma")
    parser.add_argument("-j", "--jobs", type = int,
                        help = "the number of parallel jobs")
    parser.add_argument("-t", "--top", type = int,
                        help = "the number of words you want to show on the cloud")
    parser.add_argument("-o", "--output", help = "output image file name")
    parser.add_argument("-f", "--fontpath",
                        help = "the path to the TTF/OTF fonts to use")
    # type=bool would turn any non-empty string (including "False") into
    # True.  _parse_bool keeps "-d True" working, makes "-d False" mean
    # False, and nargs="?"/const lets a bare "-d" switch debugging on.
    parser.add_argument("-d", "--debug", type=_parse_bool, nargs="?",
                        const=True, default=False,
                        help = "if you want to dump debug information")
    parser.add_argument("-m", "--mask",
                        help = "the path to the mask image")
    parser.add_argument("-c", "--contour", type = int, default = 0,
                        help = "the width of contour if you want it")
    parser.add_argument("-x", "--width", type = int, default = 400,
                        help = "the width of canvas")
    parser.add_argument("-y", "--height", type = int, default = 200,
                        help = "the height of canvas")
    parser.add_argument("-r", "--prefer-horizontal", type = float,
                        default = 0.9,
                        help = "the ratio of times to try horizontal fitting")
    args = parser.parse_args(argv)

    # Both file names are needed further down; check them before doing any
    # work so a missing --output no longer fails with a TypeError after the
    # whole input has already been segmented.
    if args.input is None or args.output is None:
        parser.print_help()
        return 1

    if args.jobs is not None:
        jieba.enable_parallel(args.jobs)

    text = readfile(args.input)
    freq = cnt(posseg.cut(text), args.parts)
    if args.top is not None:
        freq = dict(freq.most_common(args.top))
    if args.debug:
        print(freq)

    # Recognize .csv output and skip the word cloud entirely.
    if args.output.endswith('.csv'):
        _write_csv(freq, args.output)
        return 0

    mask = None
    if args.mask is not None:
        mask = _load_mask(args.mask, args.width, args.height)

    wordcloud = WordCloud(font_path = args.fontpath,
                          mask = mask,
                          contour_width = args.contour,
                          contour_color = "steelblue",
                          width = args.width,
                          height = args.height,
                          prefer_horizontal = args.prefer_horizontal)
    wc = wordcloud.generate_from_frequencies(freq)
    wc.to_image().save(args.output)
    return 0
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|