wordfreq.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. #!/usr/bin/env python3
  2. #------------------------------------------------------------------------#
  3. # "THE BEER-WARE LICENSE" (Revision 42): #
  4. # <xry111@mengyan1223.wang> wrote this file. As long as you retain this #
  5. # notice you can do whatever you want with this stuff. If we meet some #
  6. # day, and you think this stuff is worth it, you can buy me a beer in #
  7. # return. #
  8. # Xi Ruoyao #
  9. #------------------------------------------------------------------------#
  10. import argparse
  11. import jieba
  12. import jieba.posseg as posseg
  13. import sys
  14. from collections import Counter
  15. from wordcloud import WordCloud
  16. def readfile(fn):
  17. f = open(fn)
  18. s = ''
  19. for line in f:
  20. s = s + line
  21. f.close()
  22. return s
  23. def cnt(words, parts):
  24. if not parts is None:
  25. S = set()
  26. pt = parts.split(',')
  27. for x in pt:
  28. S.add(x)
  29. w = []
  30. for x in words:
  31. if parts is None or x.flag in S:
  32. w.append(x.word)
  33. return Counter(w)
  34. def main():
  35. parser = argparse.ArgumentParser()
  36. parser.add_argument("-i", "--input", help = "input file name")
  37. parser.add_argument("-p", "--parts",
  38. help = "the parts you want of speech, seperated by comma")
  39. parser.add_argument("-j", "--jobs", type = int,
  40. help = "the number of parallel jobs")
  41. parser.add_argument("-t", "--top", type = int,
  42. help = "the number of words you want to show on the cloud")
  43. parser.add_argument("-o", "--output", help = "output image file name")
  44. parser.add_argument("-f", "--fontpath",
  45. help = "the path to the TTF/OTF fonts to use")
  46. parser.add_argument("-d", "--debug", type=bool, default=False,
  47. help = "if you want to dump debug information")
  48. parser.add_argument("-m", "--mask",
  49. help = "the path to the mask image")
  50. parser.add_argument("-c", "--contour", type = int, default = 0,
  51. help = "the width of contour if you want it")
  52. args = parser.parse_args()
  53. if not args.jobs is None:
  54. jieba.enable_parallel(args.jobs)
  55. if args.input is None:
  56. parser.print_help()
  57. return 1
  58. s = readfile(args.input)
  59. words = posseg.cut(s)
  60. c = cnt(words, args.parts)
  61. if not args.top is None:
  62. c = dict(c.most_common(args.top))
  63. if args.debug:
  64. print(c)
  65. mask = None
  66. if not args.mask is None:
  67. from PIL import Image
  68. import numpy as np
  69. mask = np.array(Image.open(args.mask))
  70. wc = WordCloud(font_path = args.fontpath,
  71. mask = mask,
  72. contour_width = args.contour,
  73. contour_color = "steelblue").generate_from_frequencies(c)
  74. wc.to_image().save(args.output)
  75. if __name__ == "__main__":
  76. sys.exit(main())