wordfreq.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. #!/usr/bin/env python3
  2. #------------------------------------------------------------------------#
  3. # "THE BEER-WARE LICENSE" (Revision 42): #
  4. # <xry111@mengyan1223.wang> wrote this file. As long as you retain this #
  5. # notice you can do whatever you want with this stuff. If we meet some #
  6. # day, and you think this stuff is worth it, you can buy me a beer in #
  7. # return. #
  8. # Xi Ruoyao #
  9. #------------------------------------------------------------------------#
  10. import argparse
  11. import jieba
  12. import jieba.posseg as posseg
  13. import sys
  14. from collections import Counter
  15. from wordcloud import WordCloud
  16. def readfile(fn):
  17. f = open(fn)
  18. s = ''
  19. for line in f:
  20. s = s + line
  21. f.close()
  22. return s
  23. def cnt(words, parts):
  24. if not parts is None:
  25. S = set()
  26. pt = parts.split(',')
  27. for x in pt:
  28. S.add(x)
  29. w = []
  30. for x in words:
  31. if parts is None or x.flag in S:
  32. w.append(x.word)
  33. return Counter(w)
  34. def main():
  35. parser = argparse.ArgumentParser()
  36. parser.add_argument("-i", "--input", help = "input file name")
  37. parser.add_argument("-p", "--parts",
  38. help = "the parts you want of speech, seperated by comma")
  39. parser.add_argument("-j", "--jobs", type = int,
  40. help = "the number of parallel jobs")
  41. parser.add_argument("-t", "--top", type=int,
  42. help = "the number of words you want to show on the cloud")
  43. parser.add_argument("-o", "--output", help = "output image file name")
  44. parser.add_argument("-f", "--fontpath",
  45. help = "the path to the TTF/OTF fonts to use")
  46. parser.add_argument("-d", "--debug", type=bool, default=False,
  47. help = "if you want to dump debug information")
  48. args = parser.parse_args()
  49. if not args.jobs is None:
  50. jieba.enable_parallel(args.jobs)
  51. if args.input is None:
  52. parser.print_help()
  53. return 1
  54. s = readfile(args.input)
  55. words = posseg.cut(s)
  56. c = cnt(words, args.parts)
  57. if not args.top is None:
  58. c = dict(c.most_common(args.top))
  59. if args.debug:
  60. print(c)
  61. wc = WordCloud(font_path = args.fontpath).generate_from_frequencies(c)
  62. wc.to_image().save(args.output)
  63. if __name__ == "__main__":
  64. sys.exit(main())