# Count and report word frequencies for a file given as the first argument
#
# Compare results with https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/PG/2006/04/1-10000
#
# Jan Kybic

import sys
import re
import time
import functools
import hashing
import random
import matplotlib.pyplot as plt

# Matches maximal runs of ASCII letters; digits, punctuation, underscores and
# non-ASCII characters all act as word separators.
word_pattern = re.compile(r'[A-Za-z]+')


def read_words(filename):
    """Return all words found in the text file `filename`, lower-cased.

    A word is a maximal run of ASCII letters (see `word_pattern`).  The words
    are returned as a list in the order in which they occur in the file,
    duplicates included.

    The file is decoded as UTF-8 with undecodable bytes replaced, so the
    result does not depend on the platform's default encoding and a file in
    another ASCII-compatible encoding (e.g. Latin-1) can still be read; the
    replacement characters are never part of a word anyway.
    """
    words = []
    with open(filename, 'rt', encoding='utf-8', errors='replace') as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            # Lower-case each word after matching: lower-casing the whole
            # line first could turn some non-ASCII letters into ASCII ones
            # and thereby change which words are found.
            words.extend(word.lower() for word in word_pattern.findall(line))
    return words

def word_counts_dictionary(words):
    """Return a list of (word, frequency) pairs for the iterable `words`.

    Counting is done with a built-in dict; the pairs come out in the order in
    which each word was first seen.
    """
    tally = {}
    for word in words:
        # A word not seen before counts from zero.
        tally[word] = tally.get(word, 0) + 1
    return [pair for pair in tally.items()]

# implementation using our own hash table (the `hashing` module)
def word_counts_hashtable(words):
    """Return the (word, frequency) pairs for `words`, counted in a `hashing` table.

    The table is always rebound to the result of `hashing.put`, and a None
    result from `hashing.get` is taken to mean that the word has not been
    stored yet.  The return value is whatever `hashing.items` produces for
    the final table (presumably a list of pairs -- confirm in `hashing`).
    """
    table = hashing.Hashtable()
    for word in words:
        seen = hashing.get(table, word)
        # First occurrence starts at 1, later ones increment the stored count.
        table = hashing.put(table, word, 1 if seen is None else seen + 1)
    return hashing.items(table)

  
import binary_search_tree as bst

# implementation using a binary search tree (the `binary_search_tree` module)
def word_counts_bst(words):
    """Return the (word, frequency) pairs for `words`, counted in a search tree.

    The tree starts out as None and is always rebound to the result of
    `bst.put`.  A None result from `bst.get` is taken to mean that the word
    has not been stored yet.  The return value is whatever `bst.items`
    produces for the final tree (presumably a list of pairs -- confirm in
    `binary_search_tree`).
    """
    tree = None
    for word in words:
        seen = bst.get(tree, word)
        # First occurrence starts at 1, later ones increment the stored count.
        tree = bst.put(tree, word, 1 if seen is None else seen + 1)
    return bst.items(tree)



  
  
def print_frequencies(counts, n=10):
    """Print the `n` most frequent words from `counts`.

    `counts` is any iterable of (word, frequency) pairs.  Each output line
    shows the word right-aligned in 10 columns followed by its share of the
    total of all frequencies, as a percentage with three decimals.

    The argument itself is left untouched: the pairs are ranked in a sorted
    copy.  Pairs with equal frequency keep their original relative order.
    If there are fewer than `n` pairs, all of them are printed; `n` <= 0
    prints nothing.
    """
    # Most frequent first; sorted() copies, so the caller's data is not reordered.
    ranked = sorted(counts, key=lambda pair: pair[1], reverse=True)
    nwords = sum(pair[1] for pair in ranked)  # total number of counted words
    # max() keeps a negative n from being read as "all but the last |n|".
    for pair in ranked[:max(n, 0)]:
        print("%10s %6.3f%%" % (pair[0], pair[1] / nwords * 100.))

      
def word_frequencies(filename):
    """Read the text in `filename` and print its 15 most frequent words."""
    frequency_pairs = word_counts_dictionary(read_words(filename))
    print_frequencies(frequency_pairs, n=15)

def time_maps(filename, ns=(50000, 100000, 200000, 500000, 1000000)):
    """Benchmark the word-counting implementations and plot their run times.

    Words are read from `filename`.  For every implementation and every input
    size in `ns`, a list of that many words is drawn at random (with
    replacement) from the file, one counting call on it is timed, and the
    result is printed.  The timings are then plotted against the input size
    twice: on linear axes (saved as "time_maps.pdf") and on log-log axes
    (saved as "time_maps_log.pdf").

    ns -- input sizes to measure; pass smaller values, e.g.
          (10000, 20000, 50000), for a quick run.

    Raises ValueError if the file contains no words.
    """
    w = read_words(filename)
    nwords = len(w)
    if nwords == 0:
        # Nothing to sample from; fail with a clear message up front.
        raise ValueError("no words found in %r" % (filename,))
    ns = list(ns)
    algs = [word_counts_dictionary, word_counts_hashtable, word_counts_bst]
    alltimes = []
    for alg in algs:
        algtimes = []
        for n in ns:
            inp = [w[random.randrange(nwords)] for _ in range(n)]  # random input words
            # perf_counter() is the high-resolution clock meant for timing
            # intervals; time() can jump when the system clock is adjusted.
            t0 = time.perf_counter()
            alg(inp)
            t = time.perf_counter() - t0
            print("Algorithm ",alg.__name__, " n=",n, " time=",t)
            algtimes.append(t)
        alltimes.append(algtimes)
    # Same curves twice: figure 1 on linear axes, figure 2 on log-log axes.
    for fignum, draw, outfile in ((1, plt.plot, "time_maps.pdf"),
                                  (2, plt.loglog, "time_maps_log.pdf")):
        plt.figure(fignum)
        for alg, algtimes in zip(algs, alltimes):
            draw(ns, algtimes, marker='o', linewidth=3, label=alg.__name__)
        plt.legend(loc='upper left')
        plt.xlabel('N')
        plt.ylabel('time [s]')
        plt.savefig(outfile)

        
if __name__ == "__main__":
    # The text file to analyse is the first command-line argument.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of a bare IndexError traceback.
        sys.exit("usage: %s TEXT_FILE" % sys.argv[0])
    word_frequencies(sys.argv[1])
    # Alternative entry point: benchmark the three map implementations instead.
    #time_maps(sys.argv[1])
