"""
License: [MIT]

Copyright 2018 Chris Culy

Permission is hereby granted, free of charge, to any person obtaining a copy of this 
software and associated documentation files (the "Software"), to deal in the Software 
without restriction, including without limitation the rights to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell copies of the Software, and to permit 
persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or 
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 
FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
DEALINGS IN THE SOFTWARE.

"""

from gensim import models
from gensim.models.fasttext import FastText
import gensim.downloader as gensim_data
from gensim import utils

from matplotlib import pyplot as plt
import matplotlib
import numpy as np
import pandas as pd

import math
#import statsmodels.api as sm
from sklearn.neighbors import KernelDensity
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

from scipy import stats
from scipy.stats.stats import spearmanr
from scipy.stats.stats import pearsonr


from nltk.corpus import stopwords

from collections import Counter, defaultdict
import random

import csv

class Sampler(object):
	
	"""
	Abstract base class for sampling words (and pairs of words) by frequency.
	
	subclasses need to set up, *before* calling Sampler.__init__:
		self.counts: (Counter of words)
	
	__init__ then derives:
		self.words: all words, most frequent first
		self.ntypes: number of distinct words
		self.bins: dict from a count to the list of words having that count
		self.sorted_bin_keys: the distinct counts, in ascending order
		self.info: dict with
			'size': number of bins
			'idxs': numpy array of the bin indices 0..size-1
			'percentile_words': one randomly chosen word for each percentile 100,99,...,0
	
	NB: percentiles are taken over the distinct counts (the bins), not over the words.
	"""
	
	def __init__(self, omit_stopwords= False, min_count=1):
		"""
		preprocess for stopwords and min_count if necessary
		set up the count bins and the percentile words
		
		omit_stopwords: if True, remove the NLTK English stopwords from self.counts
		min_count: words with a count below this are removed from self.counts
		
		raises ValueError if there are no words left after filtering
		"""
		
		if min_count > 1:
			#collect first: we can't delete from the counter while iterating through it
			too_rare = [w for w, c in self.counts.items() if c < min_count]
			for w in too_rare:
				del self.counts[w]
		if omit_stopwords:
			for sw in stopwords.words('english'):
				del self.counts[sw] #deleting a missing key from a Counter is a no-op
		
		if not self.counts:
			#otherwise the percentile calculation below fails with an obscure error
			raise ValueError("Sampler has no words to sample from: counts is empty after filtering")

		self.words = [w for w, _ in self.counts.most_common(None)]
		self.ntypes = len(self.words)

		#set up count info: group the words by their count
		self.bins = defaultdict(list)
		for w, c in self.counts.most_common(None):
			self.bins[c].append(w)
		
		self.sorted_bin_keys = sorted(self.bins.keys())

		info = {}
		info['size'] = len(self.bins)
		info['idxs'] = np.array(range(info['size']))
		
		#one random representative per percentile, from 100 down to 0
		info['percentile_words'] = [
			random.choice(self.bins[self.sorted_bin_keys[Sampler._percentile_idx(info['idxs'], p)]])
			for p in range(100,-1,-1) ]
			 
		self.info = info
	
	@staticmethod
	def _percentile_idx(idxs, percentile):
		"""
		return the (integer) bin index that lies at percentile of the bin indices idxs
		"""
		
		return int(round(np.percentile(idxs,percentile)))

	def get_percentile(self,word):
		"""
		return the frequency percentile of word, as an int
		
		This is the position of the word's count among the distinct counts,
		so the result is in the range 0 to 100*(nbins-1)/nbins.
		
		raises ValueError if the count of word is not one of the bins 
		(e.g. for a word that is not in the sampler)
		"""
		
		from bisect import bisect_left #local import: only needed here
		
		c = self.counts[word]
		keys = self.sorted_bin_keys
		#keys is sorted, so binary search instead of a linear list.index
		idx = bisect_left(keys, c)
		if idx == len(keys) or keys[idx] != c:
			raise ValueError("%r has count %r, which is not in any frequency bin" % (word, c))
		return int(round(100*idx/len(keys)))
		
	def words_in_range(self,lower_percentile,upper_percentile):
		"""
		return all the words in lower_percentile,upper_percentile (both inclusive),
		from least frequent bin to most frequent bin
		"""
		
		idxs = self.info['idxs']
		lower_idx = Sampler._percentile_idx(idxs, lower_percentile)
		upper_idx = Sampler._percentile_idx(idxs, upper_percentile)
		
		return [w for idx in range(lower_idx,upper_idx+1) for w in self.bins[self.sorted_bin_keys[idx]]]
	
	def _sample_sims(self,vecs,samples,words1,words2,only_different):
		"""
		select #samples of pairs (wd1 from words1, wd2 from words2) uniformly at random
		
		if only_different, wd2 is redrawn until it differs from wd1 
		(unless words2 has a single word, where that might be impossible)
		
		return list of [count1,count2,sim(wd1,wd2)]
		"""
		
		redraw = only_different and len(words2) > 1
		what = []
		for _ in range(samples):
			wd1 = random.choice(words1)
			wd2 = random.choice(words2)
			if redraw:
				while wd2 == wd1:
					wd2 = random.choice(words2)
			
			what.append([self.counts.get(wd1), self.counts.get(wd2), vecs.similarity(wd1,wd2)])
			
		return what
		
	def uniform_sample_sims(self,vecs,samples,lower_percentile=0,upper_percentile=100,only_different=True):
		"""
		select #samples of pairs of words uniformly from range lower_percentile to upper_percentile
		
		vecs needs a similarity(wd1,wd2) method (e.g. gensim KeyedVectors)
		
		return list of [count1,count2,sim(wd1,wd2)]
		"""
		
		words = self.words_in_range(lower_percentile,upper_percentile)
		return self._sample_sims(vecs,samples,words,words,only_different)
	
	def uniform_sample_sims_ranges(self,vecs,samples,range1,range2,only_different=True):
		"""
		select #samples of pairs of words uniformly from each range1, range2 [lower_percentile upper_percentile]
		the first word of a pair comes from range1, the second from range2
		
		vecs needs a similarity(wd1,wd2) method (e.g. gensim KeyedVectors)
		
		return list of [count1,count2,sim(wd1,wd2)]
		"""
		
		words1 = self.words_in_range(range1[0],range1[1])
		words2 = self.words_in_range(range2[0],range2[1])
		return self._sample_sims(vecs,samples,words1,words2,only_different)
	
class CorpusSampler(Sampler):
	"""
	read corpus, one sentence per line, tokens separated by whitespace
	
	shuffle words if desired
	
	sets up:
		self.sents: list of lists of tokens
		self.counts: Counter of the tokens
	"""
	
	def __init__(self,fname, shuffle=False, omit_stopwords=False, min_count=1):
		"""
		fname: path of the corpus file
		shuffle: if True, replace the sentences by random "sentences" (see do_shuffle)
		omit_stopwords, min_count: passed on to Sampler
		"""
			
		#Assume one sentence per line
		with open(fname) as f:
			self.sents = [line.strip().split() for line in f]
				
		if shuffle:
			self.sents = self.do_shuffle(self.sents)
		
		self.counts = Counter(t for s in self.sents for t in s)
		
		super().__init__(omit_stopwords=omit_stopwords, min_count=min_count)
		
	def do_shuffle(self,sents, min_sent_len=2, max_sent_lent=40):
		"""
		sents is list of lists of tokens
		
		we shuffle *all* the tokens, and make new "sentences"
		each new "sentence" has a random length from min_sent_len up to 
		(but not including) max_sent_lent; the last one gets whatever tokens are left
		
		return list of new "sentences" (empty if there are no tokens)
		"""
		
		toks = [t for s in sents for t in s]
		random.shuffle(toks)
		
		#now make new "sentences"
		what = []
		first = 0
		last = random.randrange(min_sent_len,max_sent_lent)
		while last < len(toks):
			what.append(toks[first:last])
			first = last
			#use the requested bounds for every sentence, not just the first one
			last = first + random.randrange(min_sent_len,max_sent_lent)
		
		#the remainder, unless there is nothing left (i.e. there were no tokens at all)
		if first < len(toks):
			what.append(toks[first:])
		
		return what

class VectorSampler(Sampler):
	"""
	takes gensim KeyedVectors as basis. 

	This assumes that the vectors are sorted by frequency, which isn't true in general
		It is true for ones made by gensim (?)
		Not for hyperwords
		Seems to be true for GoogleNews

	Estimate them from an example of frequency of one word, using its rank and Zipfian estimate. 
	We'll also only use single words in this case, since we can't estimate right when phrases are included.
	"""
	
	def __init__(self,vecs,word,freq,phrase_separator=None,omit_stopwords=False,min_count=1):
		
		"""
		Estimating frequencies
		
		i is rank i
		
		Zipf is freq(i) ~ k/i 
		=> k = i * freq(i)
		
		NB: a medium-high rank word would work best, but we'll work with what we have
		
		vecs: vectors whose vecs.vocab maps each word to an entry with an .index (0 is the first word)
		word, freq: the reference word and its (estimated) frequency, used to calculate k
		phrase_separator: if not None, words containing it are left out
		omit_stopwords, min_count: passed on to Sampler
		
		raises ValueError if word is not in the vocabulary
		"""
		
		vocab = vecs.vocab
		
		reference = vocab.get(word)
		if reference is None:
			raise ValueError("Reference word %r is not in the vocabulary of the vectors" % word)
		
		rank = reference.index + 1
		k = rank * freq
		
		if phrase_separator is not None:
			wds = [w for w in vocab if phrase_separator not in w]
		else:
			wds = vocab.keys()

		#estimated count is k/rank, but never less than 1
		self.counts = Counter({w: max(1,int(round(k/(1+vocab[w].index)))) for w in wds})
		
		super().__init__(omit_stopwords=omit_stopwords, min_count=min_count)
		
###########
class Setup(object):
	"""
	Various methods for setting up vectors and corpora
	
	All methods are static; the class attributes below are the paths they read from.
	"""
	
	#set static variables -- put your paths here
	local_glove_vecs_dir = '/YOUR/PATH/'
	ppmi_dir = '/YOUR/PATH/'
	
	gname = '/YOUR/PATH/GoogleNews-vectors-negative300.bin.gz' #Google News Vectors
	#stanford_glove_dir = '~/gensim-data/' #now assuming that we've used gensim to download glove
	ftename = '/YOUR/PATH/wiki-news-300d-1M.vec' #FastText English vectors
	itname = '/YOUR/PATH/YOUR-ITALIAN-VECTORS.vec' #FastText Italian vectors, read by setup_FT_Italian (placeholder file name)
	
	
	
	@staticmethod
	def make_vecs(model,sents,min_count,window,dims,centroid=False, init_sims=True):
		"""
		model is word2vec, sgns, cbow, or FastText. (word2vec is the same as sgns, i.e. skip ngram)
		sents is the list of tokenized sentences to train on
		min_count, window, dims are the usual training parameters

		if centroid, iterate some number of times to find the centroid
		
		if init_sims, normalize vectors, discard original info

		return keyedvectors from model
		
		raises ValueError for an unknown model (when not using centroid)
		"""

		if model == "cbow":
			sg = 0
		else:
			sg = 1 #skip ngram
			
		(workers,downsample) = (2,0.001)

		if model == "word2vec" or model == 'sgns' or model == 'cbow':
			m = models.Word2Vec
		elif model == "FastText":
			m = FastText #NB: always the skipgram version, since sg is 1 for anything but cbow
		else:
			m = None #only an error if we have to train directly (below)

		if centroid:
			topn = 10
			runs = 20
			wds = ['the','woman','since','eager'] #arbitrary for English, but spread out in heartd
			wmodel,_,_ = iterate_centroid(sents,wds, params=(min_count,window,dims,workers,downsample), 
										  sg=sg, n=topn, runs=runs, threshold=0.99, method=model, show_progress=True)
		else:
			if m is None:
				raise ValueError('Unknown model: %s' % model)
			wmodel = m(sents, sg=sg, min_count=min_count, window=window, sample=downsample, size=dims, workers=workers)

		if init_sims:
			wmodel.wv.init_sims(True) 

		return wmodel.wv

	@staticmethod
	def setup_Glove(text,win,dims,min_count,xmax,init_sims=True):
		"""
		load pre-made vectors, created by Glove, and already converted to word2vec format
		the file name is built from local_glove_vecs_dir and the parameters
		
		if init_sims, normalize vectors, discard original info

		return vectors
		"""
		
		vecfile = Setup.local_glove_vecs_dir + 'vecs-xmax%d/%s-win%d-dim%d-thr%d.vecs' % (xmax,text,win,dims,min_count)
		wv = models.KeyedVectors.load_word2vec_format(vecfile, binary=False)
		if init_sims:
			wv.init_sims(True) #normalize, discard original info

		return wv

	@staticmethod
	def setup_ppmi_svd(text,win,dims,min_count, init_sims=True):
		"""
		load pre-made vectors (hyperwords text format)
		the file name is built from ppmi_dir and the parameters
		
		if init_sims, normalize vectors, discard original info

		return vectors
		"""
		
		import os
		import tempfile

		svdfile = Setup.ppmi_dir + '%s-win%d-dim%d-thr%d/vectors.txt' % (text,win,dims,min_count)
		
		#convert via a private temporary file, so that simultaneous runs don't overwrite each other's
		#file, and so that nothing is left behind in the working directory
		fd, tmpF = tempfile.mkstemp(suffix='.vecs')
		os.close(fd) #svd_to_word2vec opens the file by name
		try:
			Setup.svd_to_word2vec(svdfile,tmpF)
			wv = models.KeyedVectors.load_word2vec_format(tmpF, binary=False)
		finally:
			os.remove(tmpF)
		
		if init_sims:
			wv.init_sims(True) #normalize, discard original info

		return wv

	@staticmethod
	def svd_to_word2vec(wvfile,newfile):
		"""
		convert a hyperwords vectors in text format to the word2vec format: just add as the first line the number of lines followed by the dimension, separated by a space
		then save back out as newfile
		
		raises ValueError if wvfile has no lines
		"""

		with open(wvfile) as f:
			lines = f.readlines()
		
		if not lines:
			raise ValueError('No vectors in %s' % wvfile)
		
		n = len(lines)
		dim = len(lines[0].split()) - 1 #the first item on a line is the word itself

		with open(newfile, 'w') as f:
			f.write("%d %d\n" % (n,dim))
			f.writelines(lines)

	@staticmethod
	def setup_GoogleNews(init_sims=True):
		"""
		load Google News vectors (from gname) and create sampler

		if init_sims, normalize vectors, discard original info

		return vecs,sampler
		"""

		#this takes a few minutes

		gvecs = models.KeyedVectors.load_word2vec_format(Setup.gname, binary=True)
		if init_sims:
			gvecs.init_sims(True)

		#reference word and its assumed frequency, for estimating the other frequencies
		word = 'RAFFAELE'
		freq = 5
		gsampler = VectorSampler(gvecs,word,freq,phrase_separator='_')

		return gvecs, gsampler

	@staticmethod
	def setup_Glove_pre(dims, init_sims=True):
		"""
		load Glove vectors for wiki + gigaword 

		dims is {50,100,200,300}

		Corpus size is 6.000.000.000 (6B) tokens

		To crudely estimate frequencies, use "dog" in Google ngram viewer for 2000

		dog is 0.0040587344 % 

		So estimate dog frequency in Glove vectors as

		0.000040587344 * 6B = 243,524

		NB: Glove also includes lots of other stuff, including multiword tokens

		if init_sims, normalize vectors, discard original info
		
		return vecs,sampler

		e.g. glove_vecs,glove_sampler = setup_Glove_pre(50)
		"""

		#name = Setup.stanford_glove_dir + "glove-wiki-gigaword-%d" % dims
		name = 'glove-wiki-gigaword-%d' % dims #this assumes we've used gensim to download glove ...
		vecs = gensim_data.load(name)
		if init_sims:
			vecs.init_sims(True) #normalize, discard original info

		word = 'dog'
		freq = 243524
		sampler = VectorSampler(vecs,word,freq,phrase_separator='_')

		return vecs, sampler

	@staticmethod
	def setup_FT_English(init_sims=True):
		r"""
		load vectors from fasttext.cc (from ftename)
		
		dims is 300
		
		Corpus size is 16B tokens
		
		We'll use the same crude frequency estimation as for Glove:
		
		"dog" in Google ngram viewer for 2000 is 0.0040587344 % 
		
		So estimate dog frequency in FastText vectors as

		0.000040587344 * 16B = 649,398
		
		
		Alternatively, since over half the corpus is from Wikipedia, we could estimate from the first 1B words of Wikipedia
		
		egrep -io '\Wdog\W' fil9 | wc
		6217    6217   37302
		
		wc fil9
		0 124301826 713069767 fil9
		
		6217/124301826 = 0.00005001535537
		
		0.00005001535537 * 16B = 800,246
		
		(the Wikipedia based estimate is the one that is used)

		if init_sims, normalize vectors, discard original info
		
		return vecs,sampler
		"""
		
		vecs = models.KeyedVectors.load_word2vec_format(Setup.ftename, binary=False)
		if init_sims:
			vecs.init_sims(True) #normalize, discard original info

		word = 'dog'
		#freq = 649398
		freq = 800246
		sampler = VectorSampler(vecs,word,freq,phrase_separator=None)

		return vecs, sampler
	
	@staticmethod
	def setup_FT_Italian(init_sims=True):
		"""
		load vectors from fasttext.cc (from itname)

		if init_sims, normalize vectors, discard original info

		return vecs,sampler
		"""

		vecs = models.KeyedVectors.load_word2vec_format(Setup.itname, binary=False)
		if init_sims:
			vecs.init_sims(True) #normalize, discard original info

		#reference word and its assumed frequency, for estimating the other frequencies
		word = 'metallo-organici'
		freq = 5
		sampler = VectorSampler(vecs,word,freq,phrase_separator=None)

		return vecs, sampler
	
	@staticmethod
	def make_random_vecs(word_counter,dims, fname, init_sims=True):
		"""
		create a random word vector model with dims dimensions
		each vector has uniformly random values (between -1 and 1)
		the words come from the Counter word_counter, most frequent first
	
		save as word2vec format in fname
	
		if init_sims, normalize vectors, discard original info
		return the vectors
		"""
	
		vsize = len(word_counter)
		with open(fname, 'w') as f:
			f.write("%d %d\n" % (vsize,dims))
		
			for word, _ in word_counter.most_common(None):
				v = [word] + [str(random.uniform(-1,1)) for _ in range(dims)]
				f.write(' '.join(v) + '\n')
			
		vecs = models.KeyedVectors.load_word2vec_format(fname, binary=False)
		if init_sims:
			vecs.init_sims(True)
	
		return vecs 
		
	@staticmethod
	def make_standard_sampler_and_vecs(name,win,dims,min_count,omit_stopwords=False,shuffle=False,xmax=10, init_sims=True):
		"""
		name is the corpus: 'vfair' or 'heartd' (anything else raises ValueError)
		
		return dict of the corpus sampler and our 4 standard embeddings: sgns, FastText, glove, ppmi. Keys:
			sampler
			sgns
			ft
			glove
			ppmi
		"""
	
		what = dict()
		if name == 'vfair':
			fname = 'vanity_fair_pg599.txt-sents-clean.txt'
		elif name == 'heartd':
			fname = 'heart_darkness_219-0.txt.clean'
		else:
			raise ValueError('Unknown corpus: %s' % name)
		
		sampler = CorpusSampler(fname, min_count=min_count, omit_stopwords=omit_stopwords, shuffle=shuffle)
		what['sampler'] = sampler
	
		#for Glove
		sampler.counts.update(['<unk>'])
	
		what['sgns'] = Setup.make_vecs('word2vec', sampler.sents, min_count,win,dims,init_sims=init_sims)
		what['ft'] = Setup.make_vecs('FastText', sampler.sents, min_count,win,dims,init_sims=init_sims)
		what['glove'] = Setup.setup_Glove(name,win,dims,min_count,xmax,init_sims=init_sims)
		what['ppmi'] = Setup.setup_ppmi_svd(name,win,dims,min_count,init_sims=init_sims)
	
		return what

class Plotting(object):
	"""
	Various plotting functions
	
	All methods are static. The plots are shown with plt.show(), nothing is saved.
	"""
	
	@staticmethod
	def _percentile_heatmap(ax,d,cmap,step,full_range,vmin,vmax):
		"""
		draw the square matrix d as a heatmap on ax, with both axes running over the percentiles 0-100,
		and with a tick every step percentiles
		
		if full_range, the colors cover vmin to vmax, otherwise just the range of the values in d
		
		return the heatmap (for the colorbar)
		"""
		
		kwargs = dict(cmap=cmap, interpolation='nearest',
					  origin='lower',
					  extent=[0,100,0,100]) #(left, right, bottom, top)
		if full_range:
			kwargs.update(vmin=vmin, vmax=vmax)
		heatmap = ax.imshow(d, **kwargs)
		
		nticks = int(100/step) +1
		ticks = [x*step for x in range(nticks)]
		ax.set_xticks(ticks, minor=False)
		ax.set_yticks(ticks, minor=False)
		
		return heatmap
	
	@staticmethod
	def plot_histogram(sims,name,histogram=False,rug=True,bandwidth=None):
		"""
		plot the Gaussian kernel density estimation of the similarities
		
		sims are list of (count1, count2, sim); only the sim is used
		name is used in the title, together with some descriptive statistics
		
		optionally with histogram
		optionally with rug plot
		
		if bandwidth is None, then calculate it using GridSearchCV
		NB: for large samples it's too slow to calculate
		"""
		
		values = np.array([s[2] for s in sims]) #my sims
		d = values.reshape(-1,1) #KernelDensity needs one sample per row
			
		fig, ax = plt.subplots()
	
		if histogram:
			ax.hist(values, density=True, color='orange')
	
		ax.spines["top"].set_visible(False)
		ax.spines["right"].set_visible(False)

		# kde based on 
		# http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
		# and https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html (for explicit bandwidth)
		
		# use grid search cross-validation to optimize the bandwidth
		if bandwidth is None:
			params = {'bandwidth': np.logspace(-1, 1, 20)}
			grid = GridSearchCV(KernelDensity(), params)
			grid.fit(d)
			#print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth)) #I was using 0.1, which is what this gives me anyway
			# use the best estimator to compute the kernel density estimate
			kde = grid.best_estimator_
		else:
			kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
		
		# fit the KDE model
		kde.fit(d)
	
		x_d = np.linspace(-1, 1, 1000)

		# score_samples returns the log of the probability density
		logprob = kde.score_samples(x_d[:, None])
		ax.plot(x_d,np.exp(logprob), color='purple', linewidth=2)
		
		if rug:
			#rugplot
			ax.plot(values, np.zeros(len(values)), '|', color='k')
	
		ax.axvline(x=0,color='gray')
		ax.set_xlim((-1.0,1.0))
	
		#describe the flat values, so that we get plain numbers (not 1 element arrays) for the title
		desc = stats.describe(values)
	
		title = 'Sims for %s\nMean: %0.4f, Variance: %0.4f\nSkewness: %0.4f, Kurtosis: %0.4f' % (name,desc.mean,desc.variance,desc.skewness,desc.kurtosis)
	
		ax.set_xlabel('sims')
		ax.set_ylabel('density')
		ax.set_title(title)
		plt.show()

	@staticmethod
	def plot_sims_by_freq_reln(sims,name,logx=True,lower_y=-0.5):
		"""
		sims are list of (count1, count2, sim)
		if logx, use a log scale for the x axis 
			(for the difference, the log of the difference is plotted on a linear axis instead)
		lower_y is the lower limit of the y axis
	
		plot the sims by their frequency relations:
	
			higher	 use only higher frequency
			lower	 use only lower frequency
			diff = higher - lower  (shows absolutely similar and very different frequencies)
			ratio = higher/lower	(a different view for similar/different frequency)
		"""
	
		svals = [x[2] for x in sims]
	
		fig, axes = plt.subplots(2,2, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
		
		def panel(ax,xs,color,xlabel,title,log_axis):
			#one scatterplot of the sims against xs
			ax.scatter(xs, svals, color=color)
			if log_axis:
				ax.set_xscale('log')
			ax.set_xlabel(xlabel)
			ax.set_ylim(lower_y,1.1)
			ax.set_ylabel('sim')
			ax.set_title(title)
		
		freq_label = 'log frequency' if logx else 'frequency'
		
		panel(axes[0,0], [max(x[0],x[1]) for x in sims], 'orange', freq_label, 'More frequent of pair', logx)
		panel(axes[0,1], [min(x[0],x[1]) for x in sims], 'orange', freq_label, 'Less frequent of pair', logx)

		#for some reason, just using log scale didn't work right for difference?
		if logx: 
			xs = [math.log(0.1 + abs(x[0]-x[1])) for x in sims] #add 0.1 so we have positive value	 
			diff_label = 'log difference'
		else:
			xs = [abs(x[0]-x[1]) for x in sims]	   
			diff_label = 'difference'
		panel(axes[1,0], xs, 'purple', diff_label, 'Difference between pair', False)

		panel(axes[1,1], [max(x[0]/x[1], x[1]/x[0]) for x in sims], 'purple', 
			  'log ratio' if logx else 'ratio', 'Ratio of pair', logx)
		
		plt.suptitle("Sims by frequency relation %s" % name)

		#fig.tight_layout() #this stomped on main title
		plt.subplots_adjust(hspace=0.3)
		plt.show()
		

	@staticmethod
	def compare_methods_range(combo,methods=True,lowerP=0,upperP=100, lower_y=-0.7, name='', samples=1000, histogram_only=True, histogram=False, rug=True):
		"""
		plot_histogram and plot_sims_by_freq_reln for each of the methods that are specified. 
		if methods is True or None, then use all methods (sgns, ft, glove, ppmi)
			with uniform sampling  in percentiles [lowerP,upperP]
		
		combo is dict with the sampler (key 'sampler') and the vectors for each method
		lower_y is for y axis
		if histogram_only, skip plot_sims_by_freq_reln
		histogram and rug are passed on to plot_histogram
		"""
		
		sampler = combo['sampler']
		
		if methods is True or methods is None:
			methods = ['sgns','ft','glove','ppmi']
		
		for m in methods:
			usims = sampler.uniform_sample_sims(combo[m], samples, lower_percentile=lowerP, upper_percentile=upperP)
			title = "%s percentiles %d-%d, %s" % (name,lowerP,upperP, m)
			Plotting.plot_histogram(usims,title,histogram=histogram,rug=rug)
			if not histogram_only:
				Plotting.plot_sims_by_freq_reln(usims,title,lower_y=lower_y)
	
	@staticmethod
	def show_range_comparison(sampler,vecs,name,samples=1000,step=5,full_range=True):
		"""
		heatmap of sims by frequency percentiles
		
		each cell is the mean sim of samples pairs of words, one word from each of two percentile ranges of width step
		the title also gives the linear regression of the means on the two (lower) percentiles
		
		if full_range, the colors cover -1 to 1, otherwise just the range of the means
		"""

		d = []
		X = []
		y = []
		
		rng = range(0,100-step+1,step)
		for i in rng:
			row = []
			for j in rng:
				rsims = sampler.uniform_sample_sims_ranges(vecs,samples,range1=(i,i+step),range2=(j,j+step))
				m = stats.describe([x[2] for x in rsims]).mean
				row.append(m)
				X.append([i,j])
				y.append(m)
			d.append(row)

		lm = linear_model.LinearRegression()
		lm.fit(X,y)

		fig, ax = plt.subplots()
		fig.set_size_inches(8, 8)

		heatmap = Plotting._percentile_heatmap(ax,d,matplotlib.cm.Oranges,step,full_range,-1,1)

		fig.colorbar(heatmap)
		plt.suptitle("Mean similarity by percentiles of compared words: %s\n$R^2$=%0.4f, $coef_1$=%0.4f, $coef_2$=%0.4f" % (name,lm.score(X,y), lm.coef_[0], lm.coef_[1])) #lm.intercept_
		plt.show()


	@staticmethod
	def compare_range_comparisons(combo,name,samples=1000,step=5,full_range=True):
		"""
		show the range comparison for all of the items in combo
		"""

		Utils.compare_methods(combo,name,Plotting.show_range_comparison,samples=samples,step=step,full_range=full_range)
		
	@staticmethod
	def rank_ranges(sampler,vecs,range1,range2,include_reverse=False,samples=1000):
		"""
		sample words in range1 and range2 and return the relative range of wd2 w.r.t wd1 
		(i.e. range1 is the reference)
		
		range1, range2 are [lower_percentile upper_percentile]
		vecs needs a rank(wd1,wd2) method (e.g. gensim KeyedVectors)
	
		return list of vecs.rank(wd1,wd2), one for each sample
		if include_reverse, return a second list with the relative range of wd1 w.r.t. wd2
	
		"""

		words1 = sampler.words_in_range(range1[0],range1[1])
		words2 = sampler.words_in_range(range2[0],range2[1])
	
		what = []
		what_rev = []
		for _ in range(samples):
			wd1 = random.choice(words1)
			wd2 = random.choice(words2)
			if wd2 == wd1:
				if len(words2) > 1:
					while wd2 == wd1:
						wd2 = random.choice(words2)
				else:
					#no alternative, so we compare the word with itself
					print("range2 has only 1 word")

			what.append(vecs.rank(wd1,wd2))
			if include_reverse:
				what_rev.append(vecs.rank(wd2,wd1))

		if include_reverse:
			return what, what_rev
		else:
			return what
	
	@staticmethod
	def show_relrank_comparison(sampler,vecs,name,samples=100,step=5,full_range=True):
		"""
		heatmap of average relative ranks by frequency percentiles
	
		like with the sims, but using relative rank instead 
		(the mean rank divided by the number of words in the sampler)
		NB: similarity is symmetric but rank isn't
		
		Also do scatterplots of reference, comparison vs frequency, and compute Pearson and p
		
		if full_range, the colors cover 0 to 1, otherwise just the range of the means
		"""
	
		nwds = len(sampler.counts)
		d = []
		X = []
		X2 = [] #reference percentiles
		X3 = [] #comparison percentiles
		y = []
		rng = range(0,100-step+1,step)
		#columns are ranks w.r.t. the reference
		for j in rng: 
			row = []
			for i in rng: 
				ranks = Plotting.rank_ranges(sampler,vecs,range1=(i,i+step),range2=(j,j+step),samples=samples)
				m = stats.describe(ranks).mean/nwds
				row.append(m)
				X.append([i,j])
				X2.append(i)
				X3.append(j)
				y.append(m)
			d.append(row)

		lm = linear_model.LinearRegression()
		lm.fit(X,y)

		fig, ax = plt.subplots()
		fig.set_size_inches(8, 6)

		heatmap = Plotting._percentile_heatmap(ax,d,matplotlib.cm.Oranges_r,step,full_range,0,1)
		ax.set_xlabel('Reference percentiles')
		ax.set_ylabel('Comparison percentiles')

		cbar = fig.colorbar(heatmap, ax=ax)
		cbar.ax.invert_yaxis()
		
		plt.suptitle("Mean relative ranks by percentile: %s\n(darker is closer)\n$R^2$=%0.4f, Ref coef=%0.4f, Comp coef=%0.4f" % (name,lm.score(X,y), lm.coef_[0], lm.coef_[1])) #lm.intercept_

		plt.show()
		
		
		#scatter plots of reference and comparison vs percentile
		
		fig, axes = plt.subplots(nrows=1,ncols=2)
		fig.set_size_inches(10, 5)

		for ax, xs, label in ((axes[0], X2, 'Reference items'), (axes[1], X3, 'Comparison items')):
			ax.scatter(xs,y,color='purple')
			ax.invert_yaxis() #so that ranks closer to 0 are higher
			ax.set_xlabel('percentile')
			ax.set_ylabel('mean relative rank')
		
			pearson = pearsonr(xs,y)
			ax.set_title('%s:\nPearson r = %0.4f, p = %0.4f' % (label, pearson[0], pearson[1]))

		plt.show()
	
	@staticmethod	
	def compare_relrank_comparisons(combo,name,samples=100,step=5, full_range=False):
		"""
		show the relranks comparison for all of the items in combo
		"""
		
		Utils.compare_methods(combo,name,Plotting.show_relrank_comparison,samples=samples,step=step,full_range=full_range)
		

	@staticmethod
	def show_recip_rank_comparison(sampler,vecs,name,samples=100,step=5,full_range=True):
		"""
		heatmap of average diff of rank(wd1,wd2) and rank(wd2,wd1) by frequency percentiles
	
		like with the relative rank
		(the mean absolute difference is divided by the number of words in the sampler)
		
		Also do scatterplot of the differences vs the distance between the percentiles, and compute Pearson and p
		
		if full_range, the colors cover 0 to 1, otherwise just the range of the means
		"""
	
		nwds = len(sampler.counts)
		d = []
		X = []
		X2 = [] #distance between the percentiles
		y = []
		rng = range(0,100-step+1,step)
		for j in rng: 
			row = []
			for i in rng: 
				ranks1,ranks2 = Plotting.rank_ranges(sampler,vecs,range1=(i,i+step),range2=(j,j+step),samples=samples,include_reverse=True)
				ranks = abs(np.array(ranks1)-np.array(ranks2))
				m = stats.describe(ranks).mean/nwds
				row.append(m)
				X.append([i,j])
				X2.append(abs(i-j))
				y.append(m)
			d.append(row)

		lm = linear_model.LinearRegression()
		lm.fit(X,y)
		pearson = pearsonr(X2,y)

		fig, ax = plt.subplots()
		fig.set_size_inches(8, 6)

		heatmap = Plotting._percentile_heatmap(ax,d,matplotlib.cm.Oranges_r,step,full_range,0,1)
		ax.set_xlabel('Reference')
		ax.set_ylabel('Comparison')

		cbar = fig.colorbar(heatmap, ax=ax)
		cbar.ax.invert_yaxis()
		
		title = "Mean relative ranks by difference ('reciprocity') by percentile:%s\n(darker is smaller difference, more 'reciprocal')\n" % name
		title += "$R^2$=%0.4f, Ref coef=%0.4f, Comp coef=%0.4f" % (lm.score(X,y), lm.coef_[0], lm.coef_[1])
		
		plt.suptitle(title)
		plt.show()
	
		#now scatterplot
		fig, ax = plt.subplots()
		fig.set_size_inches(5, 5)
		ax.scatter(X2,y,color='purple')
		ax.invert_yaxis() #so that ranks closer to 0 are higher
		ax.set_xlabel('Percentile')
		ax.set_ylabel('Relative rank difference')
		ax.set_title("Mean relative ranks by difference ('reciprocity') vs percentile\nPearson r = %0.4f, p = %0.4f" % (pearson[0], pearson[1]))
		
		plt.show()

	@staticmethod
	def compare_recip_rank_comparisons(combo,name,samples=100,step=5,full_range=True):
		"""
		show the average diff of rank(wd1,wd2) and rank(wd2,wd1) comparison for all of the items in combo
		"""
	
		Utils.compare_methods(combo,name,Plotting.show_recip_rank_comparison,samples=samples,step=step,full_range=full_range)
		
#################
class VectorCalcs(object):
	"""
	Functions relating to individual vectors
	"""
	
	@staticmethod
	def vectors_vs_percentile(combo,lower_percentile=0,upper_percentile=100):
		"""
		For each method, fit a linear regression that predicts a word's percentile
		from its full vector, using the words in the given percentile range.

		return tuple: (list of [method, R^2], number of words used)
		"""

		sampler = combo['sampler']
		words = sampler.words_in_range(lower_percentile=lower_percentile,
									   upper_percentile=upper_percentile)

		#the regression target is the same for every method
		percentiles = [sampler.get_percentile(w) for w in words]

		scores = []
		for method in ('sgns','ft','glove','ppmi'):
			embedding = combo[method]
			features = [embedding.get_vector(w) for w in words]
			regression = linear_model.LinearRegression().fit(features, percentiles)
			scores.append([method, regression.score(features, percentiles)])

		return (scores, len(words))
	
	@staticmethod
	def compare_dimensions_vs_percentiles(combo,lower_percentile=0,upper_percentile=100):
		"""
		For every vector dimension, fit a one-variable linear regression of word
		percentile on that dimension's value, separately for each method,
		using the words in the given percentile range.

		return tuple: (rows, number of words), where each row is
			[dimension index, R^2 for sgns, R^2 for ft, R^2 for glove, R^2 for ppmi]
		"""

		methods = ['sgns','ft','glove','ppmi']
		sampler = combo['sampler']

		words = sampler.words_in_range(lower_percentile=lower_percentile, upper_percentile=upper_percentile)

		nwords = len(words)
		vsize = combo['sgns'].vector_size #all 4 models have the same number of dimensions

		#neither the target nor the word matrices depend on the dimension, so build
		#them once here rather than once per (dimension, method) pair
		y = [sampler.get_percentile(w) for w in words]
		matrices = {m: np.array([combo[m].get_vector(w) for w in words]) for m in methods}

		what = []

		for i in range(vsize):
			row = [i]
			for m in methods:
				#sklearn wants a 2-d feature matrix, hence the single-column reshape
				X = matrices[m][:,i].reshape((nwords,1))
				lm = linear_model.LinearRegression().fit(X,y)
				row.append(lm.score(X,y))
			what.append(row)

		return (what, nwords)
		
	@staticmethod
	def ave_nn_percentiles(sampler,vecs,words,k=1000):
		"""
		for each word find its k nearest neighbors
		then find the percentiles of those k-nn

		for each rank, find the average percentiles of the words at that rank

		return array of rank, average percentile
		"""

		what = [0] * k
		for w in words:
			nns = [x[0] for x in vecs.similar_by_word(w,topn=k)]
			for i in range(k):
				what[i] += sampler.get_percentile(nns[i])

		return np.array(what)/len(words)
	
	@staticmethod
	def show_ave_nn_percentiles(sampler,vecs,words,name,k=1000):
		"""
		Calculate ave_nn_percentiles and plot it against neighbor position twice:
		on linear axes and on log-log axes, each with a fitted regression line
		(for log-log a straight line corresponds to a "power law").

		Position 0 is left out of both fits; the log-log fit could not use it
		anyway, since it takes log10 of the position.

		The figure text says "percentiles"/"ap" because percentiles are what is plotted;
		it used to carry the "frequency ranks"/"afr" wording of the rank version of this plot.

		sampler, vecs, words, k: passed on to ave_nn_percentiles
		name: label used in the figure title
		"""

		aps = VectorCalcs.ave_nn_percentiles(sampler,vecs,words,k=k)
		positions = list(range(k))

		plt.figure(figsize=(12,5))

		#left panel: linear axes
		plt.subplot(1, 2, 1)
		X = np.array(range(1,k)).reshape(k-1,1)
		y = np.array(aps[1:]).reshape(k-1,1)
		model = linear_model.LinearRegression().fit(X,y)
		slope, intercept = model.coef_[0][0], model.intercept_[0]
		plt.scatter(positions, aps, marker='o', color='orange')
		plt.plot([1,k],[slope * 1 + intercept, slope * k + intercept], color='purple')
		plt.xlabel('NN-Rank')
		plt.ylabel('Ave Percentile')
		plt.title('Simple: ap = %0.4f * nnr + %0.4f, $R^2 = %0.4f$' %
				  (slope, intercept, model.score(X,y)))

		#right panel: same data, fitted and drawn in log-log space
		plt.subplot(1,2,2)
		X = np.log10(X)
		y = np.log10(y)
		model = linear_model.LinearRegression().fit(X,y)
		slope, intercept = model.coef_[0][0], model.intercept_[0]
		plt.scatter(positions, aps, marker='o', color='orange')
		#a line in log-log space is the power law 10^intercept * x^slope
		plt.plot([1,k],[(10 ** intercept) * (1 ** slope), (10 ** intercept) * (k ** slope)], color='purple')
		plt.xscale('log')
		plt.yscale('log')
		plt.xlabel('Log NN-Rank')
		plt.ylabel('Log Ave Percentile')
		plt.title('Log-Log: ap = $10^{%0.4f} * nnr^{%0.4f}$, $R^2 = %0.4f$' %
				  (intercept, slope, model.score(X,y)))

		plt.subplots_adjust(wspace=0.5)

		plt.suptitle("Average percentiles for %d nearest neighbors in %s" % (k,name))
		plt.show()

	@staticmethod
	def compare_ave_nn_percentiles(combo,name,k=1000):
		"""
		Call show_ave_nn_percentiles for each of the four methods,
		using every word in that method's own vocabulary (vecs.vocab).
		"""

		sampler = combo['sampler']
		for method in ('sgns','ft','glove','ppmi'):
			embedding = combo[method]
			VectorCalcs.show_ave_nn_percentiles(
				sampler,
				embedding,
				embedding.vocab.keys(),
				"%s, using %s" % (name, method),
				k=k)
	
	@staticmethod
	def ave_nn_ranks(sampler,vecs,words,k=1000):
		"""
		for each word find its k nearest neighbors
		then find the frequency ranks of those k-nn

		for each rank, find the average frequency ranks of the words at that rank

		return array of rank, average percentile
		"""

		what = [0] * k
		for w in words:
			nns = [x[0] for x in vecs.similar_by_word(w,topn=k)]
			for i in range(k):
				what[i] += vecs.vocab[nns[i]].index

		return np.array(what)/len(words)

	@staticmethod
	def show_ave_nn_ranks(sampler,vecs,words,name,k=1000):
		"""
		Calculate ave_nn_ranks and plot it against neighbor position twice:
		on linear axes and on log-log axes, each with a fitted regression line
		(for log-log a straight line corresponds to a "power law").

		Position 0 is left out of both fits; the scatter plots still show all k values.

		sampler, vecs, words, k: passed on to ave_nn_ranks
		name: label used in the figure title
		"""

		def fit(features, targets):
			#fit one regression and return it with its slope and intercept
			fitted = linear_model.LinearRegression().fit(features, targets)
			return fitted, fitted.coef_[0][0], fitted.intercept_[0]

		afr = VectorCalcs.ave_nn_ranks(sampler,vecs,words,k=k)
		positions = list(range(k))
		ends = [1,k]

		plt.figure(figsize=(12,5))

		#left panel: linear axes
		plt.subplot(1, 2, 1)
		nn_ranks = np.array(range(1,k)).reshape(k-1,1)
		freq_ranks = np.array(afr[1:]).reshape(k-1,1)
		fitted, slope, intercept = fit(nn_ranks, freq_ranks)
		plt.scatter(positions, afr, marker='o', color='orange')
		plt.plot(ends,[slope * x + intercept for x in ends], color='purple')
		plt.xlabel('NN-Rank')
		plt.ylabel('Ave Frequency Rank')
		plt.title('Simple: afr = %0.4f * nnr + %0.4f, $R^2 = %0.4f$' %
				  (slope, intercept, fitted.score(nn_ranks,freq_ranks)))

		#right panel: fit in log10 space, draw on log-scaled axes
		plt.subplot(1,2,2)
		log_nn_ranks = np.log10(np.array(range(1,k))).reshape(k-1,1)
		log_freq_ranks = np.log10(np.array(afr[1:])).reshape(k-1,1)
		fitted, slope, intercept = fit(log_nn_ranks, log_freq_ranks)

		plt.scatter(positions, afr, marker='o', color='orange')
		plt.plot(ends,[(10** intercept) * (x ** slope) for x in ends], color='purple')
		plt.xscale('log')
		plt.yscale('log')
		plt.xlabel('Log NN-Rank')
		plt.ylabel('Log Ave Frequency Rank')
		plt.title('Log-Log: afr = $10^{%0.4f} * nnr^{%0.4f}$, $R^2 = %0.4f$' %
				  (intercept, slope, fitted.score(log_nn_ranks,log_freq_ranks)))

		plt.subplots_adjust(wspace=0.5)

		plt.suptitle("Average frequency ranks for %d nearest neighbors in %s" % (k,name))
		plt.show()

	@staticmethod
	def compare_ave_nn_ranks(combo,name,k=1000,lower_percentile=0, upper_percentile=100):
		"""
		Call show_ave_nn_ranks for each of the four methods, always with the same words:
		those the sampler returns for the given percentile range
		(not each model's own vocabulary).
		"""

		sampler = combo['sampler']
		words = sampler.words_in_range(lower_percentile=lower_percentile, upper_percentile=upper_percentile)

		for method in ('sgns','ft','glove','ppmi'):
			label = "%s, using %s with percentiles %d - %d" % (name, method, lower_percentile, upper_percentile)
			VectorCalcs.show_ave_nn_ranks(sampler,combo[method],words,label,k=k)
			
	@staticmethod
	def plot_sims_with_mean_by_band(sampler, vecs, name, step=5):
		"""
		Histograms of the dot product of word vectors with the (global) mean vector,
		one histogram per rank percentile band of width step, followed by a line plot
		of the band means in an extra bottom row.
		(The dot product equals the cos sim only if the vectors involved are unit length.)

		The bands cover percentiles 0 to 100. They are independent of the vector size.

		sampler: supplies the words of each band via words_in_range
		vecs: supplies get_vector and the full matrix vecs.vectors
		name: label used in the figure title
		"""

		ncols = 4
		band_starts = list(range(0,100,step))
		nbands = len(band_starts)
		nrows = int(math.ceil(nbands/ncols)) +1 #histogram rows, plus one row for the means

		fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=False, sharey=False, figsize=(10,10))

		mv = np.mean(vecs.vectors, axis=0)

		means = []
		#axes.flat walks the grid row by row, i.e. in subplot-number order
		for ax, lower in zip(axes.flat, band_starts):
			words = sampler.words_in_range(lower,lower+step)
			vs = [vecs.get_vector(w) for w in words]
			d = np.dot(vs,mv)
			mn = np.mean(d)
			means.append(mn)

			ax.hist(d, density=True, color='orange')

			ax.spines["top"].set_visible(False)
			ax.spines["right"].set_visible(False)

			ax.axvline(x=0,color='gray')

			ax.set_xlabel('dot product')
			ax.set_ylabel('density')
			ax.set_title('%d - %d\nmean: %0.3f' % (lower, lower+step, mn))

		#hide the unused cells at the end of the last histogram row;
		#when that row is completely filled there is nothing to hide
		used = nbands % ncols
		if used:
			for col in range(used, ncols):
				axes[-2, col].axis('off')

		#now plot means, in the first cell of the bottom row
		ax = axes[-1, 0]
		ax.plot(means, color='purple')
		ax.axhline(y=0,color='gray')

		ax.spines["top"].set_visible(False)
		ax.spines["right"].set_visible(False)

		#label every second band with its lower percentile
		locs = list(range(0,1+len(means),2))
		labs = [x * step for x in locs]
		ax.set_xticks(locs)
		ax.set_xticklabels(labs)
		ax.set_ylim(-1,1)
		ax.set_xlabel('band')
		ax.set_ylabel('mean')
		ax.set_title('Means of bands')
		#hide rest of row
		for col in range(1,ncols):
			axes[-1, col].axis('off')

		plt.suptitle('Distribution of dot product with mean vector by percentile band: %s' % name)
		plt.subplots_adjust(hspace=0.75,wspace=0.5)
		plt.show()
		
	@staticmethod
	def compare_sims_with_mean(combo,name,step=5):
		"""
		Produce the plot_sims_with_mean_by_band figure for each method in combo,
		with bands of width step.
		"""

		band_plot = VectorCalcs.plot_sims_with_mean_by_band
		Utils.compare_methods(combo, name, band_plot, step=step)
		

	@staticmethod
	def plot_dims(sampler, vecs, name, ndims=10):
		"""
		Histograms of the values in each of the first ndims dimensions of vecs.vectors,
		followed (first cell of an extra bottom row) by a scatterplot of the
		mean value of every dimension.

		sampler is ignored; it is in the signature so that Utils.compare_methods can call this
		"""

		ncols = 5
		nrows = int(math.ceil(ndims/ncols)) +1

		fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, sharey=False, figsize=(10,10))

		for dim in range(ndims):
			ax = plt.subplot(nrows,ncols,dim+1)
			ax.hist(vecs.vectors[:,dim], density=True, color='orange')

			for side in ("top", "right"):
				ax.spines[side].set_visible(False)

			ax.axvline(x=0,color='gray')
			ax.set_xlim((-0.5,0.5)) #fixed window, the same for every dimension

			ax.set_xlabel('value')
			ax.set_ylabel('density')
			ax.set_title('Dimension %d' % dim)

		dim_means = np.mean(vecs.vectors, axis = 0)

		#means of all dimensions go in the first cell of the bottom row
		ax = plt.subplot(nrows,ncols,ncols*(nrows-1)+1)
		ax.scatter(np.arange(0,len(dim_means)), dim_means, color='purple')
		ax.axhline(y=0,color='gray')

		#a tick every 20 dimensions
		ax.set_xticks(list(range(0,1+len(dim_means),20)))

		ax.set_xlabel('dimension')
		ax.set_ylabel('mean')
		ax.set_title('Means of all dimensions\nMean: %0.4f' % np.mean(dim_means))
		#the remaining cells of the bottom row stay empty, so switch them off
		for col in range(1,ncols):
			axes[-1, col].axis('off')

		plt.suptitle('Distribution dimensions values: %s' % name)
		plt.subplots_adjust(hspace=0.5,wspace=0.5)
		plt.show()
		
	@staticmethod
	def compare_dims(combo,name,ndims=10):
		"""
		Produce the plot_dims figure (histograms of the first ndims dimensions)
		for each method in combo.
		"""

		dims_plot = VectorCalcs.plot_dims
		Utils.compare_methods(combo, name, dims_plot, ndims=ndims)

###########		
class Hubs(object):
	"""
	functions concerning hubs
	"""
	
	@staticmethod
	def nn_k(nearest,k,xs,ys):
		"""
		using nearest(), for each x, calculate NN_k(x,ys), i.e. how many times x is in k-nearest w.r.t. ys
	
		nearest has signature: nearest(word,topn), returning list of (item,sim) 
			i.e. following gensim similar_by_word
		"""
	
		xss = set(xs)
		what = Counter([z[0] for y in ys for z in nearest(y,topn=k) if z[0] in xss])
		results = list(what.items())
		#need to add in 0s, just in case
		for x in xss:
			if x not in what:
				results.append((x,0))
		return results

	@staticmethod
	def nn_k_by_percentile(sampler,vecs,name,k=1000,max_words=1000,steps=5,words_per_step=2):
		"""
		samples words_per_step words from each percentile band of width steps and calculates
		their nn_k w.r.t an overall random sample of up to max_words words

		A band with fewer than words_per_step words contributes all of its words.

		shows graph of results: nn_k against word percentile, with horizontal lines at
		the mean, mean + 2 std and mean + 3 std of the sampled nn_k values
		"""

		range_words = sampler.words_in_range(lower_percentile=0,upper_percentile=100)
		num_words = min(len(range_words),max_words)
		words = random.sample(range_words,num_words)

		base_words = []
		for x in range(0,100-steps+1,steps):
			band = sampler.words_in_range(x,x+steps)
			#random.sample raises if asked for more items than the band has
			base_words += random.sample(band,min(len(band),words_per_step))

		#a single pass over the comparison words yields nn_k for every base word,
		#instead of one full pass (len(words) similarity queries) per base word
		counts = dict(Hubs.nn_k(vecs.similar_by_word,k,base_words,words))
		what = [[word, sampler.get_percentile(word), counts[word]] for word in base_words]

		nnname = 'nn_%d' % k
		df = pd.DataFrame(what, columns=['word','percentile', nnname])

		#by percentile; within one percentile the larger nn_k first
		df = df.sort_values(by=['percentile', nnname], ascending=[True, False])

		m = df[nnname].mean()
		std = df[nnname].std()

		title = 'nn_k compared to %d words in %s\nmean: %0.4f, std: %0.4f' % (num_words,name,m,std)

		plt.plot(df['percentile'],df[nnname])
		plt.plot(df['percentile'],df[nnname], color='orange', marker='o')
		plt.xlabel('word percentile')
		plt.hlines(y=[m,m+2*std,m+3*std],xmin=0,xmax=100,colors=['gray'],linestyles=['solid','dashed','dotted'])
		plt.text(45,m+2*std+2,'2 std')
		plt.text(45,m+3*std+2,'3 std')
		plt.title(title)
		plt.show()
		
	@staticmethod
	def compare_nn_k_by_percentile(combo,name,k=1000,max_words=1000,steps=5,words_per_step=2):
		"""
		do nn_k_by_percentile for all methods

		k, max_words, steps and words_per_step are passed on as given
		"""
		Utils.compare_methods(combo,name,Hubs.nn_k_by_percentile,
							  k=k,max_words=max_words,steps=steps,words_per_step=words_per_step)
		
	
	@staticmethod
	def hubs(nearest,k,xs,ys,thresh=2):
		"""
		calculate nn_k
		hub is thresh * standard deviation
			Even though distribution of nn_k isn't normal, std is a reasonable heuristic.
		
		return tuple of: mean, std, dataframe with (x,nn_k(x)), sorted by nn_k [i.e. the xs are the hubs w.r.t. the ys]
		"""
	
		df = pd.DataFrame(Hubs.nn_k(nearest,k,xs,ys))
		df.columns = ['word','nn_k']
		m = df['nn_k'].mean()
		std = df['nn_k'].std()
		df['std'] = (df['nn_k'] - m)/std
		df.sort_values(by=['std'], ascending=False, inplace=True)
		return m,std,df.query('std > %d' % thresh)

		
	@staticmethod
	def find_hubs(sampler,vecs,potential_hubs,other_words,k=1000,thresh=4):
		"""
		find the actual hubs from among the potentials w.r.t. other_words for nn_k
		
		thresh is number of standard deviations for cutoff to be a hub
			Even though distribution of nn_k isn't normal, std is a reasonable heuristic.
		add the percentiles of the hubs
		
		return the overall mean and standard deviation and the dataframe with word,percentile,nn_k,std (=# of standard deviations from mean), 

		"""
		
		(m,std,df) = Hubs.hubs(vecs.similar_by_word,k,potential_hubs,other_words,thresh=thresh)
		df['percentile'] = [sampler.get_percentile(x) for x in list(df['word'])]
		df = df[['word','percentile','nn_k','std']]
		df.rename(columns={'std': '# stds'}, inplace=True)
		
		return m,std,df

	@staticmethod
	def find_hubs_with_all(sampler,vecs,k=1000,thresh=4):
		"""
		convenience method for using the whole vocabulary in find_hubs
		"""
		
		#we need to get the vocab from the vectors because Glove includes <unk> and others don't, 
		#but we can't add <unk> to sample.words because that messes up the other vectors
		potentials = others = list(vecs.vocab.keys())
		
		return Hubs.find_hubs(sampler,vecs,potentials,others,k=k,thresh=thresh)

	@staticmethod
	def find_hubs_for_band(sampler,vecs,k=1000,thresh=4,step=5):
		"""
		use each band of width step of the vocabulary as ys; use whole vocab as xs
		i.e. this looks for hubs that are _for_ the bands
		
		return list of triples for each band: mean,standard deviation, datafrme with data
		"""
		
		"""
		we need to get the vocab from the vectors because Glove includes <unk> and others don't, 
		but we can't add <unk> to sample.words because that messes up the other vectors
		"""
	
		potentials = list(vecs.vocab.keys())
	
		what = []
		for i in range(0,100-step+1,step):
			others = sampler.words_in_range(i,i+step)
			results = Hubs.find_hubs(sampler,vecs,potentials,others,k=1000,thresh=4)
			what.append(results)

		return what

###########
class EFreqs(object):
	"""
	class for testing/summarizing frequency effects for embeddings
	"""
	
	# for vectors
	@staticmethod
	def model_vectors(sampler,vecs,lower_percentile=0,upper_percentile=100):
		"""
		Fit a linear regression predicting a word's percentile from its vector,
		over the words in the given percentile range.

		return R^2 of that fit (scored on the same words it was fitted on)
		"""

		words = sampler.words_in_range(lower_percentile=lower_percentile,
									upper_percentile=upper_percentile)

		features = [vecs.get_vector(w) for w in words]
		percentiles = [sampler.get_percentile(w) for w in words]

		regression = linear_model.LinearRegression().fit(features, percentiles)
		return regression.score(features, percentiles)

	@staticmethod
	def model_vectors_compare(sampler, vecs, min_percentiles=[0,1,5]):
		"""
		Run model_vectors once per entry of min_percentiles, each time using the entry
		as the lower percentile bound (the upper bound keeps its default).

		return list of (min_percentile, R^2), in the order of min_percentiles
		"""

		results = []
		for lowest in min_percentiles:
			r_squared = EFreqs.model_vectors(sampler,vecs,lower_percentile=lowest)
			results.append((lowest, r_squared))
		return results
	
	#for geometry
	@staticmethod
	def plot_mean_of_sims_with_mean_by_band(sampler,vecs,name,step=5):
		"""
		Line plot of the mean dot product between the word vectors and the (global)
		mean vector, one value per rank percentile band of width step.
		"""

		mv = np.mean(vecs.vectors, axis=0)

		means = []
		for lower in range(0,100,step):
			band_vectors = [vecs.get_vector(w) for w in sampler.words_in_range(lower,lower+step)]
			means.append(np.mean(np.dot(band_vectors,mv)))

		#y limits: the top is 0.5 unless a mean exceeds it, then 1.0
		maxy = 1.0 if max(means) > 0.5 else 0.5

		#the bottom is 0 unless a mean is negative; then at least -0.5
		#(we'll probably never have less than -0.5, but just in case)
		lowest = min(means)
		miny = min(lowest,-0.5) if lowest < 0 else 0

		fig, ax = plt.subplots()

		ax.plot(means, color='purple')
		ax.axhline(y=0,color='gray')

		for side in ("top", "right"):
			ax.spines[side].set_visible(False)

		#a tick on every second band, labelled with the band's lower percentile
		locs = list(range(0,1+len(means),2))
		ax.set_xticks(locs)
		ax.set_xticklabels([x * step for x in locs])
		ax.set_ylim((miny,maxy))
		ax.set_xlabel('band')
		ax.set_ylabel('mean')
		ax.set_title('Means of bands')

		#NOTE(review): plt.title acts on the current axes, which should be ax, so this
		#presumably replaces the 'Means of bands' title set just above - confirm
		plt.title('Distribution of dot product with mean vector by percentile band in %s' % name)

		plt.show()
		
	@staticmethod
	def plot_dim_means(vecs,name):
		"""
		Scatterplot of the mean value of each dimension of vecs.vectors;
		the title also reports the mean of those means.
		"""

		dim_means = np.mean(vecs.vectors, axis = 0)
		ndims = len(dim_means)

		fig, ax = plt.subplots()
		ax.scatter(np.arange(0,ndims), dim_means, color='purple')
		ax.axhline(y=0,color='gray')

		#a tick every 20 dimensions
		ax.set_xticks(list(range(0,1+ndims,20)))

		ax.set_xlabel('dimension')
		ax.set_ylabel('mean')
		ax.set_title('Means of all dimensions in %s\nMean: %0.4f' % (name,np.mean(dim_means)))

		plt.show()
	
	#for shifted mean of similarities
	@staticmethod
	def describe_sims(sampler, vecs, samples, name, lower_percentile=0, upper_percentile=100, plot=False):
		"""
		Find the descriptive statistics of a sample of the similarities. Optionally plot the estimated density
	
		Should use a high number of samples, e.g. 100000 for vfair
	
		return dict of mean, variance, skewness, kurtosis
		"""
	
		usims = sampler.uniform_sample_sims(vecs, samples, lower_percentile=lower_percentile, upper_percentile=upper_percentile)
		desc = stats.describe([x[2] for x in usims])
	
		if plot:
			title = "%s percentiles %d-%d" % (name,lower_percentile,upper_percentile)
			Plotting.plot_histogram(usims,title,histogram=False,rug=True)
		
		return {'mean':desc.mean, 'variance':desc.variance, 'skewness':desc.skewness, 'kurtosis':desc.kurtosis}
		
	## for stratification
	@staticmethod
	def model_frequencies(sampler,vecs,samples=1000,step=5):
		"""
		Linear model of mean similarity by frequency percentile: for every pair of
		percentile bands of width step, sample similarities between the two bands
		(sampler.uniform_sample_sims_ranges) and regress the mean similarity
		on the two bands' lower bounds.

		return R^2 and the 2 coefficients (first band, second band)
		"""

		bands = range(0,100-step+1,step)
		X = [[first, second] for first in bands for second in bands]

		y = []
		for first, second in X:
			rsims = sampler.uniform_sample_sims_ranges(vecs,samples,range1=(first,first+step),range2=(second,second+step))
			y.append(stats.describe([sim[2] for sim in rsims]).mean)

		lm = linear_model.LinearRegression().fit(X,y)
		return lm.score(X,y), lm.coef_[0], lm.coef_[1]

	@staticmethod
	def model_ranks(sampler,vecs,samples=100,step=5):
		"""
		Linear model of mean relative rank by frequency percentile: for every pair of
		percentile bands of width step, get ranks from Plotting.rank_ranges, take their
		mean divided by the number of words in sampler.counts, and regress that
		on the lower bounds of the two bands (range1 first, range2 second).

		return R^2, coefficient for range1, coefficient for range2,
			pearsonr(range1 lower bounds, mean ranks), pearsonr(range2 lower bounds, mean ranks)
			where each pearsonr result is (coefficient, probability)
		"""

		nwds = len(sampler.counts)
		bands = range(0,100-step+1,step)

		X = []
		y = []
		#range2 varies in the outer loop, range1 in the inner one
		for lower2 in bands:
			for lower1 in bands:
				ranks = Plotting.rank_ranges(sampler,vecs,range1=(lower1,lower1+step),range2=(lower2,lower2+step),samples=samples)
				X.append([lower1,lower2])
				y.append(stats.describe(ranks).mean/nwds)

		lm = linear_model.LinearRegression().fit(X,y)

		pearson_range1 = pearsonr([pair[0] for pair in X],y)
		pearson_range2 = pearsonr([pair[1] for pair in X],y)

		return lm.score(X,y), lm.coef_[0], lm.coef_[1], pearson_range1, pearson_range2
		
	@staticmethod
	def power_law(sampler,vecs,name,k=1000):
		"""
		VectorCalcs.show_ave_nn_ranks over all of the sampler's words (percentiles 0 - 100),
		i.e. modeling NN_k and looking for "power law"; plots on both normal and log-log scales
		"""

		everything = sampler.words_in_range(lower_percentile=0,upper_percentile=100)
		VectorCalcs.show_ave_nn_ranks(sampler, vecs, everything, name, k=k)
		
	def power_law_stratified(sampler,vecs,name,k=1000,steps=10,words_per_step=100):
		"""
		show_avv_nn_ranks with a stratfied sampling of the vocabulary: nwords in each of steps percentiles
		"""
		
		words = []
		for x in range(0,100-steps+1,steps):
			sample = sampler.words_in_range(x,x+steps)
			words += random.sample(sample,min(len(sample),words_per_step))
			
		VectorCalcs.show_ave_nn_ranks(sampler,vecs,words,name,k=k)
		

	@staticmethod
	def model_reciprocity(sampler,vecs,samples=100,step=5):
		"""
		For every pair of percentile bands of width step, get the ranks in both directions
		from Plotting.rank_ranges (include_reverse=True), take the mean absolute difference
		of the two, divided by the number of words in sampler.counts, and correlate that
		with the distance between the lower bounds of the two bands.

		return pearsonr(percentile_diff, rank_diff), i.e. (coefficient, p)
		"""

		nwds = len(sampler.counts)
		bands = range(0,100-step+1,step)

		percentile_diffs = []
		rank_diffs = []
		#range2 varies in the outer loop, range1 in the inner one
		for lower2 in bands:
			for lower1 in bands:
				forward, backward = Plotting.rank_ranges(sampler,vecs,range1=(lower1,lower1+step),range2=(lower2,lower2+step),samples=samples,include_reverse=True)
				gaps = abs(np.array(forward)-np.array(backward))
				rank_diffs.append(stats.describe(gaps).mean/nwds)
				percentile_diffs.append(abs(lower1-lower2))

		return pearsonr(percentile_diffs,rank_diffs)
		
	#for Hubs
	def find_hubs_with_all(sampler,vecs,k=1000,thresh=4):
		"""
		just an alias, so we have all our testing/summarization together
		"""
		return Hubs.find_hubs_with_all(sampler=sampler,vecs=vecs,k=k,thresh=thresh)
	
	@staticmethod
	def hub_percentiles(sampler,vecs,k=1000,threshold=2,step=5,topn=20):
		"""
		use each band of width step of the vocabulary as others; use whole vocab as potentials
		i.e. this looks for hubs that are _for_ the bands

		this only does a summary: for each band the first topn rows that
		Hubs.find_hubs_for_band returns are counted, and if there are any hubs at all,
		a bar chart shows how often each percentile (0 - 100) occurs among them

		return number of hubs, number of hubs whose percentile is in the same band as they are a hub for
		"""

		what = Hubs.find_hubs_for_band(sampler,vecs,k=k,thresh=threshold,step=step)

		counts = Counter()
		same_bands = 0
		num_hubs = 0
		for i,(m,std,df_) in enumerate(what):
			df = df_[:topn]
			counts.update(list(df['word']))

			#find_hubs_for_band returns the bands in order, so entry i is the band
			#starting at percentile i*step (not at i)
			lower = i * step
			in_band = (df['percentile'] >= lower) & (df['percentile'] <= lower+step)
			same_bands += int(in_band.sum())
			num_hubs += len(df)

		if num_hubs > 0:
			#total hub count per percentile; a word that is a hub for several bands counts each time
			pcounts = Counter()
			for w,c in counts.most_common():
				pcounts[sampler.get_percentile(w)] += c
			pdata = [pcounts[p] for p in range(0,101)]

			fig, ax = plt.subplots(figsize=(10, 2))

			ax.bar(np.arange(0,101), pdata, color='orange')
			ax.set_xticks(np.arange(0,101,10))
			ax.set_xlabel('percentile')
			ax.set_ylabel('count')

			ax.spines["top"].set_visible(False)
			ax.spines["right"].set_visible(False)

			plt.show()

		return num_hubs, same_bands

#################
class Utils(object):
	"""
	various utility functions
	"""

	@staticmethod
	def compare_methods(combo,name,func,**kwargs):
		"""
		Call func once for each of the methods sgns, ft, glove and ppmi (in that order),
		printing an empty line after each call.

		combo: dict with the entry 'sampler' and one entry of vectors per method
		func: called as func(sampler, vectors, '<name>, using <method>', **kwargs)
		"""

		sampler = combo['sampler']
		for method in ('sgns','ft','glove','ppmi'):
			label = '%s, using %s' % (name,method)
			func(sampler, combo[method], label, **kwargs)
			print()
		
	
