'''Evaluates baseline context-embedding methods and a la carte embeddings of rare words on the CRW dataset.'''

from string import punctuation
import sys

import numpy as np
from nltk.corpus import stopwords
from scipy.stats import spearmanr
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

DIR = './'
FLOAT = np.float32
STOP = set(stopwords.words('english')) | set(punctuation)


def cosine(v1, v2):
    '''computes cosine similarity between corresponding rows
    Args:
        v1: 2-d numpy array
        v2: 2-d numpy array
    Returns:
        1-d numpy array of cosine similarities
    '''

    assert v1.shape == v2.shape
    v1 = normalize(v1)
    v2 = normalize(v2)
    return (v1 * v2).sum(axis=1)


def load_vectors(vectorfile, dim=300, skip=False):
    '''loads word vectors from a text file
    Args:
        vectorfile: string; vector file name
        dim: int; dimension of the vectors
        skip: boolean; whether or not to skip the first line (for word2vec)
    Returns:
        generator of (string, numpy array); word and its embedding
    '''

    with open(vectorfile, 'r') as f:
        for line in f:
            if skip:
                skip = False
            else:
                index = line.index(' ')
                word = line[:index]
                yield word, np.array([FLOAT(entry) for entry in line[index+1:].split()[:dim]])


def load_transform(Afile):
    '''loads the a la carte transform from a text file
    Args:
        Afile: string; transform file name
    Returns:
        numpy array
    '''

    with open(Afile, 'r') as f:
        return np.vstack([np.array([FLOAT(x) for x in line.split()]) for line in f])


def eval_crw(w2v, rarew2v):
    '''evaluates the given embeddings on the CRW dataset
    Args:
        w2v: {string: numpy array}; embeddings of non-rare words
        rarew2v: {string: numpy array}; embeddings of rare words
    Returns:
        float; Spearman correlation coefficient for the given embeddings
    '''

    dim = next(iter(w2v.values())).shape[0]
    z = np.zeros(dim)
    with open(DIR+'CRW-562.txt', 'r') as f:
        lines = list(f)
    pairs = [l.split()[:2] for l in lines]
    scores = [float(l.split()[-1]) for l in lines]
    predscores = cosine(np.vstack([w2v.get(p[0], z) for p in pairs]),
                        np.vstack([rarew2v.get(p[1], z) for p in pairs]))
    return spearmanr(scores, predscores)[0]


def get_weighted_context(line, rareword, w2v, weights=None, wndo2=5):
    '''computes the weighted sum of the words in a window around the rare word
    Args:
        line: string; context (sentence) containing the rare word
        rareword: string; rare word whose context embedding is to be computed
        w2v: {string: numpy array}; embeddings of non-rare words
        weights: {string: float}; weights for non-rare words;
            if None, computes an unweighted sum
        wndo2: int; size of the half-window
    Returns:
        numpy array
    '''

    dim = next(iter(w2v.values())).shape[0]
    z = np.zeros(dim)
    words = line.lower().split()
    index = words.index(rareword)
    window = range(max(0, index-wndo2), min(len(words), index+wndo2+1))
    if weights is None:
        return sum(w2v.get(words[i], z) for i in window)
    return sum(weights.get(words[i], 0) * w2v.get(words[i], z) for i in window)


def get_context_vectors(w2v, rarevocab, **kwargs):
    '''computes the context vectors for all contexts of all rare words
    Args:
        w2v: {string: numpy array}; embeddings of non-rare words
        rarevocab: [string]; list of rare words
        kwargs: passed to get_weighted_context
    Returns:
        {string: numpy array}; [255, dim] array of context embeddings for each rare word
    '''

    context_vectors = {}
    for word in rarevocab:
        sys.stdout.write('\rConstructing contexts: %s'%word + ' '*(20-len(word)))
        with open(DIR+'context/'+word+'.txt', 'r') as f:
            context_vectors[word] = np.vstack([get_weighted_context(line, word, w2v, **kwargs) for line in f])
    sys.stdout.write('\rConstructing contexts: Done' + ' '*16 + '\n')
    return context_vectors


def sif_weights(vocabfile, a=1E-5):
    '''computes SIF weights for all words using word counts
    Args:
        vocabfile: string; file storing words and their counts
        a: float; parameter used in the SIF weights
    Returns:
        {string: float}; SIF weight of word w is a/(a+freq[w])
    '''

    with open(vocabfile, 'r') as f:
        wordcount = {}
        for line in f:
            lastspace = line.rfind(' ')
            wordcount[line[:lastspace]] = int(line[lastspace+1:])
    totalcount = sum(wordcount.values())
    wordfreq = {word: count/totalcount for word, count in wordcount.items()}
    return {w: a/(a+f) for w, f in wordfreq.items()}


def print_scores(methods, scores):
    '''prints the average and standard deviation of the Spearman coefficients
    Args:
        methods: [string]; names of all methods
        scores: 3-d numpy array; the axes correspond to (trial, method, frequency)
    Returns:
        None
    '''

    mean = scores.mean(axis=0)
    std = scores.std(axis=0)
    print('Average Spearman correlation coefficient')
    print('freq\t'+'\t'.join(methods))
    for j in range(8):
        print(str(2**j)+'\t'+'\t'.join(['%.4f'%x for x in mean[:,j]]))
    print('Standard Deviation')
    print('freq\t'+'\t'.join(methods))
    for j in range(8):
        print(str(2**j)+'\t'+'\t'.join(['%.4f'%x for x in std[:,j]]))


def eval_baselines(trials=100):
    '''evaluates five baseline methods and a la carte on the CRW dataset
    The methods evaluated are the following:
        avg: unweighted sum of word vectors in the window
        abt: avg with the top principal component of the word vectors removed
        sif: SIF-weighted sum of word vectors in the window
        nsw: sum of non-stop-word vectors in the window
        nsf: SIF-weighted sum of word vectors in the window with the top component removed
        alc: a la carte embedding
    Prints the average and standard deviation of the Spearman coefficient for the above methods.
    Args:
        trials: int; number of random trials
    Returns:
        None
    '''

    # Load rare-word vocabulary.
    with open(DIR+'rarevocab.txt', 'r') as f:
        rarevocab = sorted([line.split()[0] for line in f])

    # Load word2vec embeddings trained on WWC.txt.
    w2v = {w: v for w, v in load_vectors('vectors.txt', skip=True)}
    print('Vectors loaded')

    # Load the transformation learnt on WWC.txt.
    A = load_transform('transform.txt')
    print('Transformation loaded')

    # Find the top singular vector of all word embeddings for "all-but-the-top".
    tsv = TruncatedSVD(n_components=1).fit(np.vstack(list(w2v.values()))).components_[0]

    # SIF weights for words (a = 1E-5).
    sif = sif_weights(DIR+'WWC_vocab.txt')
    # No-stop-words weights.
    nsw = {w: int(w not in STOP) for w in w2v.keys()}
    # SIF + no stop words.
    nsf = {w: sif[w]*nsw.get(w, 0) for w in sif.keys()}

    # Convert all contexts to vectors (the nsf context vectors are built but not scored below).
    avg_context_vectors = get_context_vectors(w2v, rarevocab)
    sif_context_vectors = get_context_vectors(w2v, rarevocab, weights=sif)
    nsw_context_vectors = get_context_vectors(w2v, rarevocab, weights=nsw)
    nsf_context_vectors = get_context_vectors(w2v, rarevocab, weights=nsf)

    # Run trials; the axes of scores are (trial, method, log2 of the number of contexts).
    methods = ['avg', 'abt', 'sif', 'nsw', 'nsf', 'alc']
    scores = np.zeros(shape=(trials, 6, 8))
    for trial in range(trials):
        # Each rare word has 255 contexts; permute them and, for each freq, average freq of them.
        perm = np.random.permutation(255)
        for logfreq in range(8):
            freq = 2**logfreq

            # Test average vectors.
            avgw2v = {word: avg_context_vectors[word][perm[freq-1:2*freq-1]].sum(axis=0) / freq
                      for word in rarevocab}
            scores[trial][0][logfreq] = eval_crw(w2v, avgw2v)

            # Test all-but-the-top vectors.
            abtw2v = {w: v - tsv*np.dot(tsv, v) for w, v in avgw2v.items()}
            scores[trial][1][logfreq] = eval_crw(w2v, abtw2v)

            # Test SIF vectors.
            sifw2v = {word: sif_context_vectors[word][perm[freq-1:2*freq-1]].sum(axis=0) / freq
                      for word in rarevocab}
            scores[trial][2][logfreq] = eval_crw(w2v, sifw2v)

            # Test no-stop-words vectors.
            nsww2v = {word: nsw_context_vectors[word][perm[freq-1:2*freq-1]].sum(axis=0) / freq
                      for word in rarevocab}
            scores[trial][3][logfreq] = eval_crw(w2v, nsww2v)

            # Test SIF + all-but-the-top vectors.
            nsfw2v = {w: v - tsv*np.dot(tsv, v) for w, v in sifw2v.items()}
            scores[trial][4][logfreq] = eval_crw(w2v, nsfw2v)

            # Test a la carte vectors.
            alcw2v = {w: v.dot(A) for w, v in avgw2v.items()}
            scores[trial][5][logfreq] = eval_crw(w2v, alcw2v)

    print_scores(methods, scores)


if __name__ == "__main__":
    eval_baselines(trials=100)