123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
- #!/usr/bin/env python3
- import argparse
- import jieba
- import jieba.posseg as posseg
- import sys
- from collections import Counter
- from wordcloud import WordCloud
def readfile(fn):
    """Read the entire contents of the file *fn* and return it as one string.

    The context manager guarantees the handle is closed even if reading
    raises; the original concatenated line-by-line (potentially quadratic)
    and leaked the handle on error. Input is decoded as UTF-8, the usual
    encoding for the Chinese text this tool segments.
    """
    with open(fn, encoding="utf-8") as f:
        return f.read()
def cnt(words, parts):
    """Count word frequencies, optionally filtered by part of speech.

    Args:
        words: iterable of segmented pairs exposing ``.word`` and ``.flag``
            attributes (e.g. the output of ``jieba.posseg.cut``).
        parts: comma-separated part-of-speech tags to keep, or ``None``
            to count every word.

    Returns:
        collections.Counter mapping word -> occurrence count.
    """
    if parts is None:
        return Counter(x.word for x in words)
    # Membership tests against a set are O(1); the original built it
    # with a manual loop and appended matches to a list.
    wanted = set(parts.split(","))
    return Counter(x.word for x in words if x.flag in wanted)
def main():
    """Parse CLI options, segment the input text, and render a word-cloud image.

    Returns:
        0 on success; 1 when required arguments are missing (after
        printing usage help).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input file name")
    parser.add_argument("-p", "--parts",
                        help="the parts of speech you want, separated by comma")
    parser.add_argument("-j", "--jobs", type=int,
                        help="the number of parallel jobs")
    parser.add_argument("-t", "--top", type=int,
                        help="the number of words you want to show on the cloud")
    parser.add_argument("-o", "--output", help="output image file name")
    parser.add_argument("-f", "--fontpath",
                        help="the path to the TTF/OTF fonts to use")
    args = parser.parse_args()

    # Both an input text and an output image path are required; the
    # original only checked --input and crashed in save() without --output.
    if args.input is None or args.output is None:
        parser.print_help()
        return 1

    if args.jobs is not None:
        jieba.enable_parallel(args.jobs)

    text = readfile(args.input)
    freqs = cnt(posseg.cut(text), args.parts)
    if args.top is not None:
        # Counter.most_common(k) keeps only the k highest-frequency words.
        freqs = dict(freqs.most_common(args.top))
    print(freqs)

    # NOTE(review): WordCloud needs a CJK-capable font via --fontpath to
    # render Chinese; without one, glyphs may appear as boxes.
    wc = WordCloud(font_path=args.fontpath).generate_from_frequencies(freqs)
    wc.to_image().save(args.output)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|