wordfreq.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. #!/usr/bin/env python3
  2. import argparse
  3. import jieba
  4. import jieba.posseg as posseg
  5. import sys
  6. from collections import Counter
  7. from wordcloud import WordCloud
  8. def readfile(fn):
  9. f = open(fn)
  10. s = ''
  11. for line in f:
  12. s = s + line
  13. f.close()
  14. return s
  15. def cnt(words, parts):
  16. if not parts is None:
  17. S = set()
  18. pt = parts.split(',')
  19. for x in pt:
  20. S.add(x)
  21. w = []
  22. for x in words:
  23. if parts is None or x.flag in S:
  24. w.append(x.word)
  25. return Counter(w)
  26. def main():
  27. parser = argparse.ArgumentParser()
  28. parser.add_argument("-i", "--input", help = "input file name")
  29. parser.add_argument("-p", "--parts",
  30. help = "the parts you want of speech, seperated by comma")
  31. parser.add_argument("-j", "--jobs", type = int,
  32. help = "the number of parallel jobs")
  33. parser.add_argument("-t", "--top", type=int,
  34. help = "the number of words you want to show on the cloud")
  35. parser.add_argument("-o", "--output", help = "output image file name")
  36. parser.add_argument("-f", "--fontpath",
  37. help = "the path to the TTF/OTF fonts to use")
  38. args = parser.parse_args()
  39. if not args.jobs is None:
  40. jieba.enable_parallel(args.jobs)
  41. if args.input is None:
  42. parser.print_help()
  43. return 1
  44. s = readfile(args.input)
  45. words = posseg.cut(s)
  46. c = cnt(words, args.parts)
  47. if not args.top is None:
  48. c = dict(c.most_common(args.top))
  49. print(c)
  50. wc = WordCloud(font_path = args.fontpath).generate_from_frequencies(c)
  51. wc.to_image().save(args.output)
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())