wordfreq.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. #!/usr/bin/env python3
  2. #------------------------------------------------------------------------#
  3. # "THE BEER-WARE LICENSE" (Revision 42): #
  4. # <xry111@mengyan1223.wang> wrote this file. As long as you retain this #
  5. # notice you can do whatever you want with this stuff. If we meet some #
  6. # day, and you think this stuff is worth it, you can buy me a beer in #
  7. # return. #
  8. # Xi Ruoyao #
  9. #------------------------------------------------------------------------#
  10. import argparse
  11. import jieba
  12. import jieba.posseg as posseg
  13. import sys
  14. from collections import Counter
  15. from wordcloud import WordCloud
  16. def readfile(fn):
  17. f = open(fn)
  18. s = ''
  19. for line in f:
  20. s = s + line
  21. f.close()
  22. return s
  23. def cnt(words, parts):
  24. if not parts is None:
  25. S = set()
  26. pt = parts.split(',')
  27. for x in pt:
  28. S.add(x)
  29. w = []
  30. for x in words:
  31. if parts is None or x.flag in S:
  32. w.append(x.word)
  33. return Counter(w)
  34. def main():
  35. parser = argparse.ArgumentParser()
  36. parser.add_argument("-i", "--input", help = "input file name")
  37. parser.add_argument("-p", "--parts",
  38. help = "the parts you want of speech, seperated by comma")
  39. parser.add_argument("-j", "--jobs", type = int,
  40. help = "the number of parallel jobs")
  41. parser.add_argument("-t", "--top", type = int,
  42. help = "the number of words you want to show on the cloud")
  43. parser.add_argument("-o", "--output", help = "output image file name")
  44. parser.add_argument("-f", "--fontpath",
  45. help = "the path to the TTF/OTF fonts to use")
  46. parser.add_argument("-d", "--debug", type=bool, default=False,
  47. help = "if you want to dump debug information")
  48. parser.add_argument("-m", "--mask",
  49. help = "the path to the mask image")
  50. parser.add_argument("-c", "--contour", type = int, default = 0,
  51. help = "the width of contour if you want it")
  52. parser.add_argument("-x", "--width", type = int, default = 400,
  53. help = "the width of canvas")
  54. parser.add_argument("-y", "--height", type = int, default = 200,
  55. help = "the height of canvas")
  56. parser.add_argument("-r", "--prefer-horizontal", type = float,
  57. default = 0.9,
  58. help = "the ratio of times to try horizontal fitting")
  59. args = parser.parse_args()
  60. if not args.jobs is None:
  61. jieba.enable_parallel(args.jobs)
  62. if args.input is None:
  63. parser.print_help()
  64. return 1
  65. s = readfile(args.input)
  66. words = posseg.cut(s)
  67. c = cnt(words, args.parts)
  68. if not args.top is None:
  69. c = dict(c.most_common(args.top))
  70. if args.debug:
  71. print(c)
  72. # recognize .csv output and skip wordcloud
  73. if args.output[-4:] == '.csv':
  74. with open(args.output, mode="w") as ofile:
  75. for i in c:
  76. ofile.write(i + ',' + str(c[i]) + '\n')
  77. return
  78. mask = None
  79. if not args.mask is None:
  80. from PIL import Image
  81. import numpy as np
  82. # the mask need to be scaled
  83. img = Image.open(args.mask).resize((args.width, args.height))
  84. mask = np.array(img)
  85. wordcloud = WordCloud(font_path = args.fontpath,
  86. mask = mask,
  87. contour_width = args.contour,
  88. contour_color = "steelblue",
  89. width = args.width,
  90. height = args.height,
  91. prefer_horizontal = args.prefer_horizontal)
  92. wc = wordcloud.generate_from_frequencies(c)
  93. wc.to_image().save(args.output)
  94. if __name__ == "__main__":
  95. sys.exit(main())