'''Evaluates baseline context-embedding methods and a la carte embeddings of rare words on the CRW dataset.'''

from string import punctuation
import sys

import numpy as np
from nltk.corpus import stopwords
from scipy.stats import spearmanr
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

DIR = './'
FLOAT = np.float32
STOP = set(stopwords.words('english')) | set(punctuation)


def cosine(v1, v2):
    '''computes cosine similarity between corresponding rows
    Args:
        v1: 2-d numpy array
        v2: 2-d numpy array
    Returns:
        1-d numpy array of cosine similarities
    '''

    assert v1.shape == v2.shape
    v1 = normalize(v1)
    v2 = normalize(v2)
    return (v1 * v2).sum(axis=1)


def load_vectors(vectorfile, dim=300, skip=False):
    '''loads word vectors from a text file
    Args:
        vectorfile: string; vector file name
        dim: int; dimension of the vectors
        skip: boolean; whether or not to skip the first line (for word2vec)
    Returns:
        generator of (string, numpy array); word and its embedding
    '''

    with open(vectorfile, 'r') as f:
        for line in f:
            if skip:
                skip = False
            else:
                index = line.index(' ')
                word = line[:index]
                yield word, np.array([FLOAT(entry) for entry in line[index+1:].split()[:dim]])


def load_transform(Afile):
    '''loads the a la carte transform from a text file
    Args:
        Afile: string; transform file name
    Returns:
        numpy array
    '''

    with open(Afile, 'r') as f:
        return np.vstack([np.array([FLOAT(x) for x in line.split()]) for line in f])


def eval_crw(w2v, rarew2v):
    '''evaluates the given embeddings on the CRW dataset
    Args:
        w2v: {string: numpy array}; embeddings of non-rare words
        rarew2v: {string: numpy array}; embeddings of rare words
    Returns:
        float; Spearman correlation coefficient for the given embeddings
    '''

    dim = next(iter(w2v.values())).shape[0]
    z = np.zeros(dim)
    with open(DIR+'CRW-562.txt', 'r') as f:
        lines = list(f)
    pairs = [l.split()[:2] for l in lines]
    scores = [float(l.split()[-1]) for l in lines]
    predscores = cosine(np.vstack([w2v.get(p[0], z) for p in pairs]),
                        np.vstack([rarew2v.get(p[1], z) for p in pairs]))
    return spearmanr(scores, predscores)[0]


def get_weighted_context(line, rareword, w2v, weights=None, wndo2=5):
    '''computes the weighted sum of the words in a window around the rare word
    Args:
        line: string; context (sentence) containing the rare word
        rareword: string; rare word whose context embedding is to be computed
        w2v: {string: numpy array}; embeddings of non-rare words
        weights: {string: float}; weights for non-rare words;
            if None, computes an unweighted sum
        wndo2: int; size of the half-window
    Returns:
        numpy array
    '''

    dim = next(iter(w2v.values())).shape[0]
    z = np.zeros(dim)
    words = line.lower().split()
    index = words.index(rareword)
    window = range(max(0, index-wndo2), min(len(words), index+wndo2+1))
    if weights is None:
        return sum(w2v.get(words[i], z) for i in window)
    return sum(weights.get(words[i], 0) * w2v.get(words[i], z) for i in window)


def get_context_vectors(w2v, rarevocab, **kwargs):
    '''computes the context vectors for all contexts of all rare words
    Args:
        w2v: {string: numpy array}; embeddings of non-rare words
        rarevocab: [string]; list of rare words
        kwargs: passed to get_weighted_context
    Returns:
        {string: numpy array}; [255, dim] array of context embeddings for each rare word
    '''

    context_vectors = {}
    for word in rarevocab:
        sys.stdout.write('\rConstructing contexts: %s'%word + ' '*(20-len(word)))
        with open(DIR+'context/'+word+'.txt', 'r') as f:
            context_vectors[word] = np.vstack([get_weighted_context(line, word, w2v, **kwargs) for line in f])
    sys.stdout.write('\rConstructing contexts: Done' + ' '*16 + '\n')
    return context_vectors


def sif_weights(vocabfile, a=1E-5):
    '''computes SIF weights for all words using word counts
    Args:
        vocabfile: string; file storing words and their counts
        a: float; parameter used in the SIF weights
    Returns:
        {string: float}; SIF weight of word w is a/(a+freq[w])
    '''

    with open(vocabfile, 'r') as f:
        wordcount = {}
        for line in f:
            lastspace = line.rfind(' ')
            wordcount[line[:lastspace]] = int(line[lastspace+1:])
    totalcount = sum(wordcount.values())
    wordfreq = {word: count/totalcount for word, count in wordcount.items()}
    return {w: a/(a+f) for w, f in wordfreq.items()}


def print_scores(methods, scores):
    '''prints the average and standard deviation of the Spearman coefficients
    Args:
        methods: [string]; names of all methods
        scores: 3-d numpy array; the axes correspond to (trial, method, frequency)
    Returns:
        None
    '''

    mean = scores.mean(axis=0)
    std = scores.std(axis=0)
    print('Average Spearman correlation coefficient')
    print('freq\t'+'\t'.join(methods))
    for j in range(8):
        print(str(2**j)+'\t'+'\t'.join(['%.4f'%x for x in mean[:,j]]))
    print('Standard Deviation')
    print('freq\t'+'\t'.join(methods))
    for j in range(8):
        print(str(2**j)+'\t'+'\t'.join(['%.4f'%x for x in std[:,j]]))


def eval_baselines(trials=100):
    '''evaluates five baseline methods and a la carte on the CRW dataset
    The methods evaluated are the following:
        avg: unweighted sum of word vectors in the window
        abt: avg with the top principal component of the word vectors removed
        sif: SIF-weighted sum of word vectors in the window
        nsw: sum of non-stop-word vectors in the window
        nsf: SIF-weighted sum of word vectors in the window with the top component removed
        alc: a la carte embedding
    Prints the average and standard deviation of the Spearman coefficient for the above methods.
    Args:
        trials: int; number of random trials
    Returns:
        None
    '''

    # Load rare-word vocabulary.
    with open(DIR+'rarevocab.txt', 'r') as f:
        rarevocab = sorted([line.split()[0] for line in f])

    # Load word2vec embeddings trained on WWC.txt.
    w2v = {w: v for w, v in load_vectors('vectors.txt', skip=True)}
    print('Vectors loaded')

    # Load the transformation learnt on WWC.txt.
    A = load_transform('transform.txt')
    print('Transformation loaded')

    # Find the top singular vector of all word embeddings for "all-but-the-top".
    tsv = TruncatedSVD(n_components=1).fit(np.vstack(list(w2v.values()))).components_[0]

    # SIF weights for words (a = 1E-5).
    sif = sif_weights(DIR+'WWC_vocab.txt')
    # No-stop-words weights.
    nsw = {w: int(w not in STOP) for w in w2v.keys()}
    # SIF + no stop words.
    nsf = {w: sif[w]*nsw.get(w, 0) for w in sif.keys()}

    # Convert all contexts to vectors (the nsf context vectors are built but not scored below).
    avg_context_vectors = get_context_vectors(w2v, rarevocab)
    sif_context_vectors = get_context_vectors(w2v, rarevocab, weights=sif)
    nsw_context_vectors = get_context_vectors(w2v, rarevocab, weights=nsw)
    nsf_context_vectors = get_context_vectors(w2v, rarevocab, weights=nsf)

    # Run trials; the axes of scores are (trial, method, log2 of the number of contexts).
    methods = ['avg', 'abt', 'sif', 'nsw', 'nsf', 'alc']
    scores = np.zeros(shape=(trials, 6, 8))
    for trial in range(trials):
        # Each rare word has 255 contexts; permute them and, for each freq, average freq of them.
        perm = np.random.permutation(255)
        for logfreq in range(8):
            freq = 2**logfreq

            # Test average vectors.
            avgw2v = {word: avg_context_vectors[word][perm[freq-1:2*freq-1]].sum(axis=0) / freq
                      for word in rarevocab}
            scores[trial][0][logfreq] = eval_crw(w2v, avgw2v)

            # Test all-but-the-top vectors.
            abtw2v = {w: v - tsv*np.dot(tsv, v) for w, v in avgw2v.items()}
            scores[trial][1][logfreq] = eval_crw(w2v, abtw2v)

            # Test SIF vectors.
            sifw2v = {word: sif_context_vectors[word][perm[freq-1:2*freq-1]].sum(axis=0) / freq
                      for word in rarevocab}
            scores[trial][2][logfreq] = eval_crw(w2v, sifw2v)

            # Test no-stop-words vectors.
            nsww2v = {word: nsw_context_vectors[word][perm[freq-1:2*freq-1]].sum(axis=0) / freq
                      for word in rarevocab}
            scores[trial][3][logfreq] = eval_crw(w2v, nsww2v)

            # Test SIF + all-but-the-top vectors.
            nsfw2v = {w: v - tsv*np.dot(tsv, v) for w, v in sifw2v.items()}
            scores[trial][4][logfreq] = eval_crw(w2v, nsfw2v)

            # Test a la carte vectors.
            alcw2v = {w: v.dot(A) for w, v in avgw2v.items()}
            scores[trial][5][logfreq] = eval_crw(w2v, alcw2v)

    print_scores(methods, scores)


if __name__ == "__main__":
    eval_baselines(trials=100)