diff --git a/.travis.yml b/.travis.yml index a502794..2afe9c4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ language: python python: - - "2.7" + - "3.6" sudo: false diff --git a/README.rst b/README.rst index b8aeaf9..ec1e0cb 100644 --- a/README.rst +++ b/README.rst @@ -9,6 +9,7 @@ mrec recommender systems library Introduction ------------ +This fork is Python 3 only. `mrec` is a Python package developed at `Mendeley `_ to support recommender systems development and evaluation. The package currently focuses on item similarity and other methods that work well on implicit feedback, and on experimental evaluation. Why another package when there are already some really good software projects implementing recommender systems? diff --git a/doc/conf.py b/doc/conf.py index e2b3948..9c2fe47 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,10 +51,10 @@ try: release = pkg_resources.get_distribution('mrec').version except pkg_resources.DistributionNotFound: - print 'To build the documentation, The distribution information of mrec' - print 'has to be available. Either install the package into your' - print 'development environment or run "python setup.py develop" to setup' - print 'the metadata.' + print('To build the documentation, The distribution information of mrec') + print('has to be available. 
Either install the package into your') + print('development environment or run "python setup.py develop" to setup') + print('the metadata.') sys.exit(1) del pkg_resources version = '.'.join(release.split('.')[:2]) diff --git a/mrec/__init__.py b/mrec/__init__.py index dece6e3..0492a66 100644 --- a/mrec/__init__.py +++ b/mrec/__init__.py @@ -1,18 +1,12 @@ -from itertools import izip -import numpy as np -from scipy.sparse import coo_matrix, csr_matrix from scipy.io import mmread, mmwrite -try: - import cPickle as pickle -except ImportError: - import pickle -from sparse import fast_sparse_matrix, loadtxt, loadz, savez -from base_recommender import BaseRecommender +from mrec.base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix, loadtxt, loadz, savez __version__ = '0.3.1' -def load_fast_sparse_matrix(input_format,filepath): + +def load_fast_sparse_matrix(input_format, filepath): """ Load a fast_sparse_matrix from an input file of the specified format, by delegating to the appropriate static method. @@ -31,14 +25,15 @@ def load_fast_sparse_matrix(input_format,filepath): if input_format == 'tsv': return fast_sparse_matrix.loadtxt(filepath) elif input_format == 'csv': - return fast_sparse_matrix.loadtxt(filepath,delimiter=',') + return fast_sparse_matrix.loadtxt(filepath, delimiter=',') elif input_format == 'mm': return fast_sparse_matrix.loadmm(filepath) elif input_format == 'fsm': return fast_sparse_matrix.load(filepath) raise ValueError('unknown input format: {0}'.format(input_format)) -def load_sparse_matrix(input_format,filepath): + +def load_sparse_matrix(input_format, filepath): """ Load a scipy.sparse.csr_matrix from an input file of the specified format. 
@@ -57,7 +52,7 @@ def load_sparse_matrix(input_format,filepath): if input_format == 'tsv': return loadtxt(filepath) elif input_format == 'csv': - return loadtxt(filepath,delimiter=',') + return loadtxt(filepath, delimiter=',') elif input_format == 'mm': return mmread(filepath).tocsr() elif input_format == 'npz': @@ -66,7 +61,8 @@ def load_sparse_matrix(input_format,filepath): return fast_sparse_matrix.load(filepath).X raise ValueError('unknown input format: {0}'.format(input_format)) -def save_sparse_matrix(data,fmt,filepath): + +def save_sparse_matrix(data, fmt, filepath): """ Save a scipy sparse matrix in the specified format. Row and column indices will be converted to 1-indexed if you specify a plain text @@ -88,24 +84,25 @@ def save_sparse_matrix(data,fmt,filepath): """ if fmt == 'tsv': m = data.tocoo() - with open(filepath,'w') as out: - for u,i,v in izip(m.row,m.col,m.data): - print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,v) + with open(filepath, 'w') as out: + for u, i, v in zip(m.row, m.col, m.data): + print('{0}\t{1}\t{2}'.format(u + 1, i + 1, v), file=out) elif fmt == 'csv': m = data.tocoo() - with open(filepath,'w') as out: - for u,i,v in izip(m.row,m.col,m.data): - print >>out,'{0},{1},{2}'.format(u+1,i+1,v) + with open(filepath, 'w') as out: + for u, i, v in zip(m.row, m.col, m.data): + print('{0},{1},{2}'.format(u + 1, i + 1, v), file=out) elif fmt == 'mm': - mmwrite(filepath,data) + mmwrite(filepath, data) elif fmt == 'npz': - savez(data.tocoo(),filepath) + savez(data.tocoo(), filepath) elif fmt == 'fsm': fast_sparse_matrix(data).save(filepath) else: raise ValueError('unknown output format: {0}'.format(fmt)) -def save_recommender(model,filepath): + +def save_recommender(model, filepath): """ Save a recommender model to file. 
@@ -118,6 +115,7 @@ def save_recommender(model,filepath): """ model.save(filepath) + def load_recommender(filepath): """ Load a recommender model from file after it has been saved by @@ -130,6 +128,7 @@ def load_recommender(filepath): """ return BaseRecommender.load(filepath) + def read_recommender_description(filepath): """ Read a recommender model description from file after it has diff --git a/mrec/base_recommender.py b/mrec/base_recommender.py index ef5333f..fc9bf11 100644 --- a/mrec/base_recommender.py +++ b/mrec/base_recommender.py @@ -1,10 +1,9 @@ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle + import numpy as np from scipy.sparse import csr_matrix + class BaseRecommender(object): """ Minimal interface to be implemented by recommenders, along with @@ -23,7 +22,7 @@ class BaseRecommender(object): and the batch methods to recommend items. """ - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. @@ -48,7 +47,7 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features """ raise NotImplementedError('you must implement recommend_items()') - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Train on supplied data. In general you will want to implement this rather than computing recommendations on @@ -63,7 +62,7 @@ def fit(self,train,item_features=None): """ raise NotImplementedError('you should implement fit()') - def save(self,filepath): + def save(self, filepath): """ Serialize model to file. 
@@ -84,9 +83,9 @@ def save(self,filepath): archive = self._create_archive() if archive: - np.savez(filepath,**archive) + np.savez(filepath, **archive) else: - pickle.dump(self,open(filepath,'w')) + pickle.dump(self, open(filepath, 'wb')) def _create_archive(self): """ @@ -114,10 +113,10 @@ def load(filepath): The filepath to read from. """ r = np.load(filepath) - if isinstance(r,BaseRecommender): + if isinstance(r, BaseRecommender): model = r else: - model = np.loads(str(r['model'])) + model = np.loads(r['model']) model._load_archive(r) # restore any fields serialized separately return model @@ -144,15 +143,15 @@ def read_recommender_description(filepath): filepath : str The filepath to read from. """ - r = np.load(filepath,mmap_mode='r') - if isinstance(r,BaseRecommender): + r = np.load(filepath, mmap_mode='r') + if isinstance(r, BaseRecommender): model = r else: - model = np.loads(str(r['model'])) + model = np.loads(r['model']) return str(model) def __str__(self): - if hasattr(self,'description'): + if hasattr(self, 'description'): return self.description return 'unspecified recommender: you should set self.description or implement __str__()' @@ -190,12 +189,12 @@ def batch_recommend_items(self, this for most recommenders. """ recs = [] - for u in xrange(self.num_users): - if show_progress and u%1000 == 0: - print u,'..', - recs.append(self.recommend_items(dataset,u,max_items,return_scores)) + for u in range(self.num_users): + if show_progress and u % 1000 == 0: + print(u, '..', ) + recs.append(self.recommend_items(dataset, u, max_items, return_scores)) if show_progress: - print + print() return recs def range_recommend_items(self, @@ -234,9 +233,9 @@ def range_recommend_items(self, This provides a default implementation, you will be able to optimize this for most recommenders. 
""" - return [self.recommend_items(dataset,u,max_items,return_scores) for u in xrange(user_start,user_end)] + return [self.recommend_items(dataset, u, max_items, return_scores) for u in range(user_start, user_end)] - def _zero_known_item_scores(self,r,train): + def _zero_known_item_scores(self, r, train): """ Helper function to set predicted scores/ratings for training items to zero or less, to avoid recommending already known items. @@ -255,7 +254,7 @@ def _zero_known_item_scores(self,r,train): in train. """ col = train.indices - if isinstance(r,csr_matrix): + if isinstance(r, csr_matrix): max_score = r.data.max() else: max_score = r.max() @@ -264,9 +263,8 @@ def _zero_known_item_scores(self,r,train): # - we can't just use row,col = train.nonzero() as this eliminates # u,i for which train[u,i] has been explicitly set to zero row = np.zeros(col.shape) - for u in xrange(train.shape[0]): - start,end = train.indptr[u],train.indptr[u+1] + for u in range(train.shape[0]): + start, end = train.indptr[u], train.indptr[u + 1] if end > start: row[start:end] = u - return r - csr_matrix((data,(row,col)),shape=r.shape) - + return r - csr_matrix((data, (row, col)), shape=r.shape) diff --git a/mrec/evaluation/__init__.py b/mrec/evaluation/__init__.py index 6b6455a..8550749 100644 --- a/mrec/evaluation/__init__.py +++ b/mrec/evaluation/__init__.py @@ -11,18 +11,18 @@ class Evaluator(object): The number of recommendations needed to compute the evaluation function. 
""" - def __init__(self,compute_metrics,max_items): + def __init__(self, compute_metrics, max_items): self.compute_metrics = compute_metrics self.max_items = max_items - def _add_metrics(self,predicted,actual): - metrics = self.compute_metrics(predicted,actual) + def _add_metrics(self, predicted, actual): + metrics = self.compute_metrics(predicted, actual) if metrics: - for m,val in metrics.iteritems(): + for m, val in metrics.items(): self.cum_metrics[m] += val self.count += 1 - def process(self,testdata,recsfile,start,end,offset=1): + def process(self, testdata, recsfile, start, end, offset=1): """ Parameters ---------- @@ -54,19 +54,19 @@ def process(self,testdata,recsfile,start,end,offset=1): last_user = start recs = [] for line in open(recsfile): - user,item,score = line.strip().split('\t') - user = int(user)-1 # convert to 0-indxed - item = int(item)-1 + user, item, score = line.strip().split('\t') + user = int(user) - 1 # convert to 0-indxed + item = int(item) - 1 if user >= end: break if user < start: continue if user != last_user: - self._add_metrics(recs,testdata[last_user,:].indices.tolist()) + self._add_metrics(recs, testdata[last_user, :].indices.tolist()) last_user = user recs = [] if len(recs) < self.max_items: recs.append(item) - self._add_metrics(recs,testdata[last_user,:].indices.tolist()) + self._add_metrics(recs, testdata[last_user, :].indices.tolist()) - return self.cum_metrics,self.count + return self.cum_metrics, self.count diff --git a/mrec/evaluation/metrics.py b/mrec/evaluation/metrics.py index ec5a787..21f8988 100644 --- a/mrec/evaluation/metrics.py +++ b/mrec/evaluation/metrics.py @@ -4,47 +4,50 @@ * with prec@k and MRR """ +from collections import defaultdict + import numpy as np from scipy import stats -from collections import defaultdict + # classes to access known items for each test user class get_known_items_from_dict(object): - - def __init__(self,data): + def __init__(self, data): self.data = data - def __call__(self,u): + def 
__call__(self, u): return self.data[u] -class get_known_items_from_csr_matrix(object): - def __init__(self,data): +class get_known_items_from_csr_matrix(object): + def __init__(self, data): self.data = data - def __call__(self,u): + def __call__(self, u): return self.data[u].indices -class get_known_items_from_thresholded_csr_matrix(object): - def __init__(self,data,min_value): +class get_known_items_from_thresholded_csr_matrix(object): + def __init__(self, data, min_value): self.data = data self.min_value = min_value - def __call__(self,u): + def __call__(self, u): items = self.data[u].toarray().flatten() - items[items= self.thresh: if self.binarize: val = 1 else: val = 0 - return int(user),(int(item),val) + return int(user), (int(item), val) + class SplitCreator(object): """ @@ -52,45 +54,45 @@ class SplitCreator(object): sample_before_thresholding : bool (default: False) If True then consider any item seen by the user for inclusion in the test group, even though only items - with positive scrore will be selected. If the input + with positive score will be selected. If the input includes items with zero scores this means that the test set may be smaller than the requested size for some users, even though they have apparently seen enough items. 
""" - def __init__(self,test_size,normalize=False,discard_zeros=False,sample_before_thresholding=False): + def __init__(self, test_size, normalize=False, discard_zeros=False, sample_before_thresholding=False): self.test_size = test_size self.normalize = normalize self.discard_zeros = discard_zeros self.sample_before_thresholding = sample_before_thresholding - def handle(self,vals): + def handle(self, vals): if self.sample_before_thresholding: - train,test = self.split(vals) + train, test = self.split(vals) else: - train,test = self.stratified_split(vals) - train = [(v,c) for v,c in train if not self.discard_zeros or c > 0] - test = [(v,c) for v,c in test if c > 0] + train, test = self.stratified_split(vals) + train = [(v, c) for v, c in train if not self.discard_zeros or c > 0] + test = [(v, c) for v, c in test if c > 0] if self.normalize: - norm = sum(c*c for v,c in train)**0.5 + norm = sum(c * c for v, c in train) ** 0.5 if norm > 0: - train = [(v,c/norm) for v,c in train] - return train,test + train = [(v, c / norm) for v, c in train] + return train, test - def pos_neg_vals(self,vals): + def pos_neg_vals(self, vals): vals = list(vals) - pos = [(v,c) for v,c in vals if c > 0] - neg = [(v,0) for v,c in vals if c == 0] - return pos,neg + pos = [(v, c) for v, c in vals if c > 0] + neg = [(v, 0) for v, c in vals if c == 0] + return pos, neg - def split(self,vals): + def split(self, vals): random.shuffle(vals) num_train = self.num_train(vals) - return vals[:num_train],vals[num_train:] + return vals[:num_train], vals[num_train:] - def stratified_split(self,vals): - pos,neg = self.pos_neg_vals(vals) + def stratified_split(self, vals): + pos, neg = self.pos_neg_vals(vals) random.shuffle(pos) train = pos[:self.num_train(pos)] if not self.discard_zeros: @@ -98,9 +100,9 @@ def stratified_split(self,vals): train.extend(neg[:self.num_train(neg)]) random.shuffle(train) test = pos[self.num_train(pos):] - return train,test + return train, test - def num_train(self,vals): + def 
num_train(self, vals): if self.test_size >= 1: - return len(vals)-self.test_size - return int(len(vals)*(1.0-self.test_size)) + return len(vals) - self.test_size + return int(len(vals) * (1.0 - self.test_size)) diff --git a/mrec/evaluation/tests/test_metrics.py b/mrec/evaluation/tests/test_metrics.py index d0b9bab..da84721 100644 --- a/mrec/evaluation/tests/test_metrics.py +++ b/mrec/evaluation/tests/test_metrics.py @@ -3,38 +3,42 @@ from mrec.evaluation import metrics + def test_sort_metrics_by_name(): - names = ['recall@10','z-score','auc','recall@5'] - expected = ['auc','recall@5','recall@10','z-score'] - assert_equal(expected,metrics.sort_metrics_by_name(names)) + names = ['recall@10', 'z-score', 'auc', 'recall@5'] + expected = ['auc', 'recall@5', 'recall@10', 'z-score'] + assert_equal(expected, metrics.sort_metrics_by_name(names)) + def test_prec(): - true = [2,8,6,4] - predicted = [6,5,8,7] - expected = [1,0.5,2./3.,0.5] - for k in xrange(1,5): - assert_equal(metrics.prec([],true,k),0) - assert_equal(metrics.prec(true,true,k),1) - assert_equal(metrics.prec(predicted,true,k),expected[k-1]) - assert_equal(metrics.prec(true,true,5),0.8) - assert_equal(metrics.prec(true,true,5,ignore_missing=True),1) - assert_equal(metrics.prec(predicted,true,5),0.4) - assert_equal(metrics.prec(predicted,true,5,ignore_missing=True),expected[3]) + true = [2, 8, 6, 4] + predicted = [6, 5, 8, 7] + expected = [1, 0.5, 2. 
/ 3., 0.5] + for k in range(1, 5): + assert_equal(metrics.prec([], true, k), 0) + assert_equal(metrics.prec(true, true, k), 1) + assert_equal(metrics.prec(predicted, true, k), expected[k - 1]) + assert_equal(metrics.prec(true, true, 5), 0.8) + assert_equal(metrics.prec(true, true, 5, ignore_missing=True), 1) + assert_equal(metrics.prec(predicted, true, 5), 0.4) + assert_equal(metrics.prec(predicted, true, 5, ignore_missing=True), expected[3]) + def test_hit_rate(): - predicted = [6,5,8,7] - for true in [[],[2,8]]: - for k in xrange(1,5): + predicted = [6, 5, 8, 7] + for true in [[], [2, 8]]: + for k in range(1, 5): with assert_raises(ValueError): - metrics.hit_rate(predicted,true,k) + metrics.hit_rate(predicted, true, k) true = [5] - expected = [0,1,1,1] - for k in xrange(1,5): - assert_equal(metrics.hit_rate(predicted,true,k),expected[k-1]) + expected = [0, 1, 1, 1] + for k in range(1, 5): + assert_equal(metrics.hit_rate(predicted, true, k), expected[k - 1]) + def test_rr(): - true = [2,8,6,4] - predicted = [5,7,6,8] - expected = [0,0,1./3.,1./3.] - for k in xrange(1,5): - assert_equal(metrics.rr(predicted[:k],true),expected[k-1]) + true = [2, 8, 6, 4] + predicted = [5, 7, 6, 8] + expected = [0, 0, 1. / 3., 1. / 3.] + for k in range(1, 5): + assert_equal(metrics.rr(predicted[:k], true), expected[k - 1]) diff --git a/mrec/examples/convert.py b/mrec/examples/convert.py index 53442f5..b0775fe 100644 --- a/mrec/examples/convert.py +++ b/mrec/examples/convert.py @@ -2,13 +2,13 @@ Convert sparse matrix from one file format to another. 
""" -import os import subprocess -def tsv2mtx(infile,outfile): - num_users,num_items,nnz = 0,0,0 + +def tsv2mtx(infile, outfile): + num_users, num_items, nnz = 0, 0, 0 for line in open(infile): - u,i,v = line.strip().split() + u, i, v = line.strip().split() u = int(u) i = int(i) if u > num_users: @@ -16,12 +16,13 @@ def tsv2mtx(infile,outfile): if i > num_items: num_items = i nnz += 1 - headerfile = outfile+'.header' - with open(headerfile,'w') as header: - print >>header,'%%MatrixMarket matrix coordinate real general' - print >>header,'{0} {1} {2}'.format(num_users,num_items,nnz) - subprocess.check_call(['cat',headerfile,infile],stdout=open(outfile,'w')) - subprocess.check_call(['rm',headerfile]) + headerfile = outfile + '.header' + with open(headerfile, 'w') as header: + print('%%MatrixMarket matrix coordinate real general', file=header) + print('{0} {1} {2}'.format(num_users, num_items, nnz), file=header) + subprocess.check_call(['cat', headerfile, infile], stdout=open(outfile, 'w')) + subprocess.check_call(['rm', headerfile]) + def main(): from optparse import OptionParser @@ -29,12 +30,14 @@ def main(): from mrec import load_sparse_matrix, save_sparse_matrix parser = OptionParser() - parser.add_option('--input_format',dest='input_format',help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') - parser.add_option('--input',dest='input',help='filepath to input') - parser.add_option('--output_format',dest='output_format',help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') - parser.add_option('--output',dest='output',help='filepath for output') - - (opts,args) = parser.parse_args() + parser.add_option('--input_format', dest='input_format', + help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') + parser.add_option('--input', 
dest='input', help='filepath to input') + parser.add_option('--output_format', dest='output_format', + help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)') + parser.add_option('--output', dest='output', help='filepath for output') + + (opts, args) = parser.parse_args() if not opts.input or not opts.output or not opts.input_format or not opts.output_format: parser.print_help() raise SystemExit @@ -44,11 +47,11 @@ def main(): if opts.input_format == 'tsv' and opts.output_format == 'mm': # we can do this without loading the data - tsv2mtx(opts.input,opts.output) + tsv2mtx(opts.input, opts.output) else: - data = load_sparse_matrix(opts.input_format,opts.input) - save_sparse_matrix(data,opts.output_format,opts.output) + data = load_sparse_matrix(opts.input_format, opts.input) + save_sparse_matrix(data, opts.output_format, opts.output) + if __name__ == '__main__': main() - diff --git a/mrec/examples/evaluate.py b/mrec/examples/evaluate.py index 24d6633..0bd17d0 100644 --- a/mrec/examples/evaluate.py +++ b/mrec/examples/evaluate.py @@ -4,8 +4,8 @@ to the training filepaths. 
""" -def main(): +def main(): import os import logging import glob @@ -16,21 +16,26 @@ def main(): from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate from mrec.evaluation import Evaluator from mrec.evaluation.metrics import print_report - from filename_conventions import get_testfile, get_recsfile + from mrec.examples.filename_conventions import get_testfile, get_recsfile - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') - parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') - parser.add_option('--recsdir',dest='recsdir',help='directory containing tsv files of precomputed recommendations') - parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)') - parser.add_option('--description',dest='description',help='description of model which generated the recommendations') - metrics_funcs = {'main':compute_main_metrics, - 'hitrate':compute_hit_rate} - - (opts,args) = parser.parse_args() + parser.add_option('--input_format', dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--test_input_format', dest='test_input_format', default='npz', + help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') + parser.add_option('--train', dest='train', + help='glob 
specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') + parser.add_option('--recsdir', dest='recsdir', help='directory containing tsv files of precomputed recommendations') + parser.add_option('--metrics', dest='metrics', default='main', + help='which set of metrics to compute, main|hitrate (default: %default)') + parser.add_option('--description', dest='description', + help='description of model which generated the recommendations') + metrics_funcs = {'main': compute_main_metrics, + 'hitrate': compute_hit_rate} + + (opts, args) = parser.parse_args() if not opts.input_format or not opts.train or not opts.recsdir \ or opts.metrics not in metrics_funcs: parser.print_help() @@ -39,7 +44,7 @@ def main(): opts.train = os.path.abspath(os.path.expanduser(opts.train)) opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir)) - evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20) + evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20) trainfiles = glob.glob(opts.train) @@ -47,14 +52,15 @@ def main(): for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) testfile = get_testfile(trainfile) - recsfile = get_recsfile(trainfile,opts.recsdir) - testdata = load_sparse_matrix(opts.test_input_format,testfile).tocsr() - cum_metrics,count = evaluator.process(testdata,recsfile,0,testdata.shape[0]) + recsfile = get_recsfile(trainfile, opts.recsdir) + testdata = load_sparse_matrix(opts.test_input_format, testfile).tocsr() + cum_metrics, count = evaluator.process(testdata, recsfile, 0, testdata.shape[0]) if cum_metrics is not None: for m in cum_metrics: - all_metrics[m].append(float(cum_metrics[m])/count) + all_metrics[m].append(float(cum_metrics[m]) / count) + + print_report([opts.description], [all_metrics]) - print_report([opts.description],[all_metrics]) if __name__ == '__main__': main() diff --git a/mrec/examples/factors.py b/mrec/examples/factors.py index b2d54d6..2bd3366 100644 
--- a/mrec/examples/factors.py +++ b/mrec/examples/factors.py @@ -3,9 +3,8 @@ and evaluation recommendations with mrec scripts. """ -def main(): - import os +def main(): import logging import subprocess from optparse import OptionParser @@ -14,19 +13,22 @@ def main(): from mrec import save_recommender from mrec.mf.recommender import MatrixFactorizationRecommender - from filename_conventions import get_modelfile + from mrec.examples.filename_conventions import get_modelfile - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--factor_format',dest='factor_format',help='format of factor files tsv | mm (matrixmarket) | npy (numpy array)') - parser.add_option('--user_factors',dest='user_factors',help='user factors filepath') - parser.add_option('--item_factors',dest='item_factors',help='item factors filepath') - parser.add_option('--train',dest='train',help='filepath to training data, just used to apply naming convention to output model saved here') - parser.add_option('--outdir',dest='outdir',help='directory for output') - parser.add_option('--description',dest='description',help='optional description of how factors were computed, will be saved with model so it can be output with evaluation results') - - (opts,args) = parser.parse_args() + parser.add_option('--factor_format', dest='factor_format', + help='format of factor files tsv | mm (matrixmarket) | npy (numpy array)') + parser.add_option('--user_factors', dest='user_factors', help='user factors filepath') + parser.add_option('--item_factors', dest='item_factors', help='item factors filepath') + parser.add_option('--train', dest='train', + help='filepath to training data, just used to apply naming convention to output model saved here') + parser.add_option('--outdir', dest='outdir', help='directory for output') + 
parser.add_option('--description', dest='description', + help='optional description of how factors were computed, will be saved with model so it can be output with evaluation results') + + (opts, args) = parser.parse_args() if not opts.factor_format or not opts.user_factors or not opts.item_factors \ or not opts.outdir: parser.print_help() @@ -54,12 +56,13 @@ def main(): logging.info('saving model...') logging.info('creating output directory {0}...'.format(opts.outdir)) - subprocess.check_call(['mkdir','-p',opts.outdir]) + subprocess.check_call(['mkdir', '-p', opts.outdir]) - modelfile = get_modelfile(opts.train,opts.outdir) - save_recommender(model,modelfile) + modelfile = get_modelfile(opts.train, opts.outdir) + save_recommender(model, modelfile) logging.info('done') + if __name__ == '__main__': main() diff --git a/mrec/examples/filename_conventions.py b/mrec/examples/filename_conventions.py index 0906827..bfbf778 100644 --- a/mrec/examples/filename_conventions.py +++ b/mrec/examples/filename_conventions.py @@ -10,42 +10,52 @@ import os + def get_testfile(trainfile): filename = os.path.basename(trainfile) - return os.path.join(os.path.dirname(trainfile),filename.replace('train','test')) + return os.path.join(os.path.dirname(trainfile), filename.replace('train', 'test')) + -def get_simsdir(trainfile,outdir): +def get_simsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-sims'.format(filename)) + return os.path.join(outdir, '{0}-sims'.format(filename)) + -def get_recsdir(trainfile,outdir): +def get_recsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-recs'.format(filename)) + return os.path.join(outdir, '{0}-recs'.format(filename)) -def get_modelsdir(trainfile,outdir): + +def get_modelsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-models'.format(filename)) + return os.path.join(outdir, '{0}-models'.format(filename)) + 
-def get_factorsdir(trainfile,outdir): +def get_factorsdir(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}-factors'.format(filename)) + return os.path.join(outdir, '{0}-factors'.format(filename)) -def get_simsfile(trainfile,outdir): + +def get_simsfile(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}.sims.tsv'.format(filename)) + return os.path.join(outdir, '{0}.sims.tsv'.format(filename)) + -def get_recsfile(trainfile,outdir): +def get_recsfile(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}.recs.tsv'.format(filename)) + return os.path.join(outdir, '{0}.recs.tsv'.format(filename)) + -def get_modelfile(trainfile,outdir): +def get_modelfile(trainfile, outdir): filename = os.path.basename(trainfile) - return os.path.join(outdir,'{0}.model.npz'.format(filename)) + return os.path.join(outdir, '{0}.model.npz'.format(filename)) -def get_sortedfile(infile,outdir): + +def get_sortedfile(infile, outdir): filename = os.path.basename(infile) - return os.path.join(outdir,'{0}.sorted'.format(filename)) + return os.path.join(outdir, '{0}.sorted'.format(filename)) + -def get_splitfile(infile,outdir,split_type,i): +def get_splitfile(infile, outdir, split_type, i): filename = os.path.basename(infile) - return os.path.join(outdir,'{0}.{1}.{2}'.format(filename,split_type,i)) + return os.path.join(outdir, '{0}.{1}.{2}'.format(filename, split_type, i)) diff --git a/mrec/examples/predict.py b/mrec/examples/predict.py index 6b8ab1d..bbba9f3 100644 --- a/mrec/examples/predict.py +++ b/mrec/examples/predict.py @@ -11,29 +11,26 @@ makes it easy to run a cross-validated evaluation. 
""" -import math import glob +import logging import re -import os import subprocess -from shutil import rmtree -import logging from collections import defaultdict +from shutil import rmtree from mrec import load_sparse_matrix, read_recommender_description, load_recommender -from mrec.parallel import predict -from mrec.mf.recommender import MatrixFactorizationRecommender +from mrec.examples.filename_conventions import * from mrec.item_similarity.recommender import ItemSimilarityRecommender +from mrec.mf.recommender import MatrixFactorizationRecommender +from mrec.parallel import predict -from filename_conventions import * - -ONE_MB = 2**20 +ONE_MB = 2 ** 20 -def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator): - recsdir = get_recsdir(trainfile,opts.outdir) +def process(view, opts, modelfile, trainfile, testfile, featurefile, outdir, evaluator): + recsdir = get_recsdir(trainfile, opts.outdir) logging.info('creating recs directory {0}...'.format(recsdir)) - subprocess.check_call(['mkdir','-p',recsdir]) + subprocess.check_call(['mkdir', '-p', recsdir]) done = [] if not opts.overwrite: @@ -57,7 +54,7 @@ def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator) logging.info('running in parallel across ipython engines...') results = [] - results.append(view.map_async(predict.run,tasks,retries=2)) + results.append(view.map_async(predict.run, tasks, retries=2)) # wait for tasks to complete processed = [r.get() for r in results] @@ -69,10 +66,10 @@ def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator) if remaining == 0: logging.info('SUCCESS: all tasks completed') logging.info('concatenating {0} partial output files...'.format(len(done))) - paths = [os.path.join(recsdir,'recs.{0}-{1}.tsv'.format(start,end)) for start,end in done] - cmd = ['cat']+paths - recsfile = get_recsfile(trainfile,outdir) - subprocess.check_call(cmd,stdout=open(recsfile,'w')) + paths = [os.path.join(recsdir, 
'recs.{0}-{1}.tsv'.format(start, end)) for start, end in done] + cmd = ['cat'] + paths + recsfile = get_recsfile(trainfile, outdir) + subprocess.check_call(cmd, stdout=open(recsfile, 'w')) logging.info('removing partial output files...') rmtree(recsdir) logging.info('done') @@ -81,18 +78,19 @@ def process(view,opts,modelfile,trainfile,testfile,featurefile,outdir,evaluator) avg_metrics = defaultdict(float) tot_count = 0 for results in processed: - for cum_metrics,count in results: - for m,val in cum_metrics.iteritems(): + for cum_metrics, count in results: + for m, val in cum_metrics.items(): avg_metrics[m] += val tot_count += count for m in avg_metrics: avg_metrics[m] /= float(tot_count) else: - logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) + logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks))) logging.error('try rerunning the command to retry the remaining tasks') avg_metrics = None - return read_recommender_description(modelfile),avg_metrics + return read_recommender_description(modelfile), avg_metrics + def create_tasks(modelfile, input_format, @@ -105,92 +103,105 @@ def create_tasks(modelfile, mb_per_task, done, evaluator): - users_per_task,num_users = estimate_users_per_task(mb_per_task,input_format,trainfile,modelfile) + users_per_task, num_users = estimate_users_per_task(mb_per_task, input_format, trainfile, modelfile) tasks = [] - for start in xrange(0,num_users,users_per_task): - end = min(num_users,start+users_per_task) - generate = (start,end) not in done - tasks.append((modelfile,input_format,trainfile,test_input_format,testfile,item_feature_format,featurefile,outdir,start,end,evaluator,generate)) - logging.info('created {0} tasks, {1} users per task'.format(len(tasks),users_per_task)) + for start in range(0, num_users, users_per_task): + end = min(num_users, start + users_per_task) + generate = (start, end) not in done + tasks.append((modelfile, 
input_format, trainfile, test_input_format, testfile, item_feature_format, featurefile, + outdir, start, end, evaluator, generate)) + logging.info('created {0} tasks, {1} users per task'.format(len(tasks), users_per_task)) return tasks -def estimate_users_per_task(mb_per_task,input_format,trainfile,modelfile): - num_users,num_items,nnz = get_dataset_size(input_format,trainfile) + +def estimate_users_per_task(mb_per_task, input_format, trainfile, modelfile): + num_users, num_items, nnz = get_dataset_size(input_format, trainfile) logging.info('loading model to get size...') model = load_recommender(modelfile) # we load the training and test data on every task # - let's guess that worst case the test data will be the same size - required_mb_per_task = 2*(nnz*16)/ONE_MB - if isinstance(model,MatrixFactorizationRecommender): + required_mb_per_task = 2 * (nnz * 16) / ONE_MB + if isinstance(model, MatrixFactorizationRecommender): # we have to load the factors on every task - required_mb_per_task += ((model.U.size+model.V.size)*16)/ONE_MB + required_mb_per_task += ((model.U.size + model.V.size) * 16) / ONE_MB if mb_per_task > required_mb_per_task: # remaining mem usage is dominated by computed scores: - users_per_task = ((mb_per_task-required_mb_per_task)*ONE_MB) / (num_items*16) - elif isinstance(model,ItemSimilarityRecommender): + users_per_task = ((mb_per_task - required_mb_per_task) * ONE_MB) / (num_items * 16) + elif isinstance(model, ItemSimilarityRecommender): # we have to load the similarity matrix on every task - required_mb_per_task += (model.similarity_matrix.nnz*16)/ONE_MB + required_mb_per_task += (model.similarity_matrix.nnz * 16) / ONE_MB if mb_per_task > required_mb_per_task: # estimate additional usage from avg items per user and sims per item items_per_user = nnz / num_users sims_per_item = model.similarity_matrix.nnz / num_items - users_per_task = ((mb_per_task-required_mb_per_task)*ONE_MB) / (items_per_user*sims_per_item*16) + users_per_task = 
((mb_per_task - required_mb_per_task) * ONE_MB) / (items_per_user * sims_per_item * 16) else: # assume nothing else to load users_per_task = num_users if mb_per_task <= required_mb_per_task: - raise RuntimeError('requires at least {0}MB per task, increase --mb_per_task if you can'.format(required_mb_per_task)) + raise RuntimeError( + 'requires at least {0}MB per task, increase --mb_per_task if you can'.format(required_mb_per_task)) + + return int(users_per_task), int(num_users) - return users_per_task,num_users -def get_dataset_size(input_format,datafile): +def get_dataset_size(input_format, datafile): logging.info('loading dataset to get size...') - dataset = load_sparse_matrix(input_format,datafile) - return dataset.shape[0],dataset.shape[1],dataset.nnz + dataset = load_sparse_matrix(input_format, datafile) + return dataset.shape[0], dataset.shape[1], dataset.nnz + def find_done(outdir): - success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) + success_files = glob.glob(os.path.join(outdir, '*.SUCCESS')) r = re.compile('.*?([0-9]+)-([0-9]+)\.SUCCESS$') done = [] for path in success_files: m = r.match(path) start = int(m.group(1)) end = int(m.group(2)) - done.append((start,end)) + done.append((start, end)) return done -def main(): +def main(): import os from optparse import OptionParser - from IPython.parallel import Client + from ipyparallel import Client from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate from mrec.evaluation import Evaluator - from mrec import load_recommender from mrec.evaluation.metrics import print_report - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--mb_per_task',dest='mb_per_task',type='int',default=None,help='approximate memory limit per task in MB, so total memory usage is num_engines * mb_per_task (default: share all available 
RAM across engines)') - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') - parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') - parser.add_option('--modeldir',dest='modeldir',help='directory containing trained models') - parser.add_option('--outdir',dest='outdir',help='directory for output files') - parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)') - parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') - parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)') - parser.add_option('--overwrite',dest='overwrite',action='store_true',default=False,help='overwrite existing files in outdir (default: %default)') - parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') - parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') - - metrics_funcs = {'main':compute_main_metrics, - 'hitrate':compute_hit_rate} - - (opts,args) = parser.parse_args() + parser.add_option('--mb_per_task', dest='mb_per_task', type='int', default=None, + help='approximate memory limit per task in MB, so total memory usage is num_engines * mb_per_task (default: share all available RAM across engines)') + parser.add_option('--input_format', 
dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--test_input_format', dest='test_input_format', default='npz', + help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)') + parser.add_option('--train', dest='train', + help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') + parser.add_option('--modeldir', dest='modeldir', help='directory containing trained models') + parser.add_option('--outdir', dest='outdir', help='directory for output files') + parser.add_option('--metrics', dest='metrics', default='main', + help='which set of metrics to compute, main|hitrate (default: %default)') + parser.add_option('--item_feature_format', dest='item_feature_format', + help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') + parser.add_option('--item_features', dest='item_features', + help='path to sparse item features in tsv format (item_id,feature_id,val)') + parser.add_option('--overwrite', dest='overwrite', action='store_true', default=False, + help='overwrite existing files in outdir (default: %default)') + parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)') + parser.add_option('--add_module_paths', dest='add_module_paths', + help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') + + metrics_funcs = {'main': compute_main_metrics, + 'hitrate': compute_hit_rate} + + (opts, args) = parser.parse_args() if not opts.input_format or not opts.train or not opts.outdir \ or not opts.modeldir or opts.metrics not in metrics_funcs: parser.print_help() @@ -206,7 +217,7 @@ def main(): if opts.mb_per_task is None: import psutil num_engines = len(view) - opts.mb_per_task = 
psutil.virtual_memory().available/ONE_MB/(num_engines+1) # don't take *all* the memory + opts.mb_per_task = psutil.virtual_memory().available / ONE_MB / (num_engines + 1) # don't take *all* the memory if opts.add_module_paths: c[:].execute('import sys') @@ -214,7 +225,7 @@ def main(): logging.info('adding {0} to pythonpath on all engines'.format(path)) c[:].execute("sys.path.append('{0}')".format(path)) - evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20) + evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20) trainfiles = glob.glob(opts.train) @@ -222,9 +233,10 @@ def main(): all_metrics = defaultdict(list) for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) - modelfile = get_modelfile(trainfile,opts.modeldir) + modelfile = get_modelfile(trainfile, opts.modeldir) testfile = get_testfile(trainfile) - description,metrics = process(view,opts,modelfile,trainfile,testfile,opts.item_features,opts.outdir,evaluator) + description, metrics = process(view, opts, modelfile, trainfile, testfile, opts.item_features, opts.outdir, + evaluator) descriptions.add(description) if metrics is not None: for m in metrics: @@ -232,9 +244,10 @@ def main(): description = ' AND '.join(descriptions) if len(descriptions) > 1: - logging.warn('You are aggregating metrics from different models! {}'.format(description)) + logging.warning('You are aggregating metrics from different models! 
{}'.format(description)) + + print_report([description], [all_metrics]) - print_report([description],[all_metrics]) if __name__ == '__main__': main() diff --git a/mrec/examples/prepare.py b/mrec/examples/prepare.py index a7fe6e0..ec1ac45 100644 --- a/mrec/examples/prepare.py +++ b/mrec/examples/prepare.py @@ -1,44 +1,44 @@ class Processor(object): - - def __init__(self,splitter,parser,min_items_per_user,preprocess=None): + def __init__(self, splitter, parser, min_items_per_user, preprocess=None): self.splitter = splitter self.parser = parser self.min_items_per_user = min_items_per_user self.preprocess = preprocess - def output(self,user,vals,outfile): - for v,c in vals: - print >>outfile,'{0}\t{1}\t{2}'.format(user,v,c) + def output(self, user, vals, outfile): + for v, c in vals: + print('{0}\t{1}\t{2}'.format(user, v, c), file=outfile) - def handle(self,user,vals): + def handle(self, user, vals): if len(vals) >= self.min_items_per_user: if self.preprocess is not None: vals = self.preprocess(vals) - train,test = self.splitter.handle(vals) - self.output(user,train,self.train_out) - self.output(user,test,self.test_out) + train, test = self.splitter.handle(vals) + self.output(user, train, self.train_out) + self.output(user, test, self.test_out) else: self.too_few_items += 1 - def create_split(self,infile,train_out,test_out): + def create_split(self, infile, train_out, test_out): self.train_out = train_out self.test_out = test_out self.too_few_items = 0 last_user = None vals = [] for line in infile: - user,val = self.parser.parse(line) + user, val = self.parser.parse(line) if user != last_user: if last_user is not None: - self.handle(last_user,vals) + self.handle(last_user, vals) last_user = user vals = [] vals.append(val) - self.handle(last_user,vals) + self.handle(last_user, vals) def get_too_few_items(self): return self.too_few_items + def main(): import os import logging @@ -46,24 +46,30 @@ def main(): from optparse import OptionParser from 
mrec.evaluation.preprocessing import TSVParser, SplitCreator - from filename_conventions import get_sortedfile, get_splitfile + from mrec.examples.filename_conventions import get_sortedfile, get_splitfile - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('--dataset',dest='dataset',help='path to input dataset in tsv format') - parser.add_option('--delimiter',dest='delimiter',default='\t',help='input delimiter (default: tab)') - parser.add_option('--outdir',dest='outdir',help='directory for output files') - parser.add_option('--num_splits',dest='num_splits',type='int',default=5,help='number of train/test splits to create (default: %default)') - parser.add_option('--min_items_per_user',dest='min_items_per_user',type='int',default=10,help='skip users with less than this number of ratings (default: %default)') - parser.add_option('--binarize',dest='binarize',action='store_true',default=False,help='binarize ratings') - parser.add_option('--normalize',dest='normalize',action='store_true',help='scale training ratings to unit norm') - parser.add_option('--rating_thresh',dest='rating_thresh',type='float',default=0,help='treat ratings below this as zero (default: %default)') - parser.add_option('--test_size',dest='test_size',type='float',default=0.5,help='target number of test items for each user, if test_size >= 1 treat as an absolute number, otherwise treat as a fraction of the total items (default: %default)') - parser.add_option('--discard_zeros',dest='discard_zeros',action='store_true',help='discard zero training ratings after thresholding (not recommended, incompatible with using training items to guarantee that recommendations are novel)') - parser.add_option('--sample_before_thresholding',dest='sample_before_thresholding',action='store_true',help='choose test items before thresholding ratings 
(not recommended, test items below threshold will then be discarded)') - - (opts,args) = parser.parse_args() + parser.add_option('--dataset', dest='dataset', help='path to input dataset in tsv format') + parser.add_option('--delimiter', dest='delimiter', default='\t', help='input delimiter (default: tab)') + parser.add_option('--outdir', dest='outdir', help='directory for output files') + parser.add_option('--num_splits', dest='num_splits', type='int', default=5, + help='number of train/test splits to create (default: %default)') + parser.add_option('--min_items_per_user', dest='min_items_per_user', type='int', default=10, + help='skip users with less than this number of ratings (default: %default)') + parser.add_option('--binarize', dest='binarize', action='store_true', default=False, help='binarize ratings') + parser.add_option('--normalize', dest='normalize', action='store_true', help='scale training ratings to unit norm') + parser.add_option('--rating_thresh', dest='rating_thresh', type='float', default=0, + help='treat ratings below this as zero (default: %default)') + parser.add_option('--test_size', dest='test_size', type='float', default=0.5, + help='target number of test items for each user, if test_size >= 1 treat as an absolute number, otherwise treat as a fraction of the total items (default: %default)') + parser.add_option('--discard_zeros', dest='discard_zeros', action='store_true', + help='discard zero training ratings after thresholding (not recommended, incompatible with using training items to guarantee that recommendations are novel)') + parser.add_option('--sample_before_thresholding', dest='sample_before_thresholding', action='store_true', + help='choose test items before thresholding ratings (not recommended, test items below threshold will then be discarded)') + + (opts, args) = parser.parse_args() if not opts.dataset or not opts.outdir: parser.print_help() raise SystemExit @@ -72,30 +78,30 @@ def main(): opts.outdir = 
os.path.abspath(opts.outdir) logging.info('sorting input data...') - infile = get_sortedfile(opts.dataset,opts.outdir) - subprocess.check_call(['mkdir','-p',opts.outdir]) - subprocess.check_call(['sort','-k1','-n',opts.dataset],stdout=open(infile,'w')) + infile = get_sortedfile(opts.dataset, opts.outdir) + subprocess.check_call(['mkdir', '-p', opts.outdir]) + subprocess.check_call(['sort', '-k1', '-n', opts.dataset], stdout=open(infile, 'w')) - parser = TSVParser(thresh=opts.rating_thresh,binarize=opts.binarize,delimiter=opts.delimiter) - splitter = SplitCreator(test_size=opts.test_size,normalize=opts.normalize,discard_zeros=opts.discard_zeros, + parser = TSVParser(thresh=opts.rating_thresh, binarize=opts.binarize, delimiter=opts.delimiter) + splitter = SplitCreator(test_size=opts.test_size, normalize=opts.normalize, discard_zeros=opts.discard_zeros, sample_before_thresholding=opts.sample_before_thresholding) - processor = Processor(splitter,parser,opts.min_items_per_user) + processor = Processor(splitter, parser, opts.min_items_per_user) - for i in xrange(opts.num_splits): - trainfile = get_splitfile(opts.dataset,opts.outdir,'train',i) - testfile = get_splitfile(opts.dataset,opts.outdir,'test',i) + for i in range(opts.num_splits): + trainfile = get_splitfile(opts.dataset, opts.outdir, 'train', i) + testfile = get_splitfile(opts.dataset, opts.outdir, 'test', i) - logging.info('creating split {0}: {1} {2}'.format(i,trainfile,testfile)) - processor.create_split(open(infile),open(trainfile,'w'),open(testfile,'w')) + logging.info('creating split {0}: {1} {2}'.format(i, trainfile, testfile)) + processor.create_split(open(infile), open(trainfile, 'w'), open(testfile, 'w')) too_few_items = processor.get_too_few_items() - if (too_few_items): - logging.info('skipped {0} users with less than {1} ratings'.format(too_few_items,opts.min_items_per_user)) + if too_few_items: + logging.info('skipped {0} users with less than {1} ratings'.format(too_few_items, 
opts.min_items_per_user)) logging.info('cleaning up...') - subprocess.check_call(['rm',infile]) + subprocess.check_call(['rm', infile]) logging.info('done') + if __name__ == '__main__': main() - diff --git a/mrec/examples/train.py b/mrec/examples/train.py index 7fe3c92..2588be9 100644 --- a/mrec/examples/train.py +++ b/mrec/examples/train.py @@ -10,16 +10,16 @@ easy to generate data for cross-validated evaluation. """ -from filename_conventions import * +from mrec.examples.filename_conventions import * -def main(): +def main(): import os import logging import glob import subprocess from optparse import OptionParser - from IPython.parallel import Client + from ipyparallel import Client from mrec import load_fast_sparse_matrix, save_recommender from mrec.item_similarity.slim import SLIM @@ -32,34 +32,54 @@ def main(): from mrec.parallel.wrmf import WRMFRunner from mrec.parallel.warp import WARPMFRunner - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') parser = OptionParser() - parser.add_option('-n','--num_engines',dest='num_engines',type='int',default=0,help='number of IPython engines to use') - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') - parser.add_option('--outdir',dest='outdir',help='directory for output files') - parser.add_option('--overwrite',dest='overwrite',action='store_true',help='overwrite existing files in outdir') - parser.add_option('--model',dest='model',default='slim',help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)') - parser.add_option('--max_sims',dest='max_sims',type='int',default=100,help='max similar items to 
output for each training item (default: %default)') - parser.add_option('--learner',dest='learner',default='sgd',help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)') - parser.add_option('--l1_reg',dest='l1_reg',type='float',default=0.001,help='l1 regularization constant (default: %default)') - parser.add_option('--l2_reg',dest='l2_reg',type='float',default=0.0001,help='l2 regularization constant (default: %default)') - parser.add_option('--metric',dest='metric',default='cosine',help='metric for knn recommender: cosine | dot (default: %default)') - parser.add_option('--num_factors',dest='num_factors',type='int',default=80,help='number of latent factors (default: %default)') - parser.add_option('--alpha',dest='alpha',type='float',default=1.0,help='wrmf confidence constant (default: %default)') - parser.add_option('--lbda',dest='lbda',type='float',default=0.015,help='wrmf regularization constant (default: %default)') - parser.add_option('--als_iters',dest='als_iters',type='int',default=15,help='number of als iterations (default: %default)') - parser.add_option('--gamma',dest='gamma',type='float',default=0.01,help='warp learning rate (default: %default)') - parser.add_option('--C',dest='C',type='float',default=100.0,help='warp regularization constant (default: %default)') - parser.add_option('--item_feature_format',dest='item_feature_format',help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') - parser.add_option('--item_features',dest='item_features',help='path to sparse item features in tsv format (item_id,feature_id,val)') - parser.add_option('--popularity_method',dest='popularity_method',default='count',help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)') - parser.add_option('--popularity_thresh',dest='popularity_thresh',type='float',default=0,help='ignore scores below this when computing popularity for baseline recommender (default: %default)') 
- parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') - parser.add_option('--add_module_paths',dest='add_module_paths',help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') - - (opts,args) = parser.parse_args() + parser.add_option('-n', '--num_engines', dest='num_engines', type='int', default=0, + help='number of IPython engines to use') + parser.add_option('--input_format', dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--train', dest='train', + help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard') + parser.add_option('--outdir', dest='outdir', help='directory for output files') + parser.add_option('--overwrite', dest='overwrite', action='store_true', help='overwrite existing files in outdir') + parser.add_option('--model', dest='model', default='slim', + help='type of model to train: slim | knn | wrmf | warp | popularity (default: %default)') + parser.add_option('--max_sims', dest='max_sims', type='int', default=100, + help='max similar items to output for each training item (default: %default)') + parser.add_option('--learner', dest='learner', default='sgd', + help='underlying learner for SLIM learner: sgd | elasticnet | fs_sgd (default: %default)') + parser.add_option('--l1_reg', dest='l1_reg', type='float', default=0.001, + help='l1 regularization constant (default: %default)') + parser.add_option('--l2_reg', dest='l2_reg', type='float', default=0.0001, + help='l2 regularization constant (default: %default)') + parser.add_option('--metric', dest='metric', default='cosine', + help='metric for knn recommender: cosine | dot (default: %default)') + parser.add_option('--num_factors', dest='num_factors', type='int', default=80, + help='number of latent 
factors (default: %default)') + parser.add_option('--alpha', dest='alpha', type='float', default=1.0, + help='wrmf confidence constant (default: %default)') + parser.add_option('--lbda', dest='lbda', type='float', default=0.015, + help='wrmf regularization constant (default: %default)') + parser.add_option('--als_iters', dest='als_iters', type='int', default=15, + help='number of als iterations (default: %default)') + parser.add_option('--gamma', dest='gamma', type='float', default=0.01, + help='warp learning rate (default: %default)') + parser.add_option('--C', dest='C', type='float', default=100.0, + help='warp regularization constant (default: %default)') + parser.add_option('--item_feature_format', dest='item_feature_format', + help='format of item features tsv | csv | mm (matrixmarket) | npz (numpy arrays)') + parser.add_option('--item_features', dest='item_features', + help='path to sparse item features in tsv format (item_id,feature_id,val)') + parser.add_option('--popularity_method', dest='popularity_method', default='count', + help='how to compute popularity for baseline recommender: count | sum | avg | thresh (default: %default)') + parser.add_option('--popularity_thresh', dest='popularity_thresh', type='float', default=0, + help='ignore scores below this when computing popularity for baseline recommender (default: %default)') + parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)') + parser.add_option('--add_module_paths', dest='add_module_paths', + help='optional comma-separated list of paths to append to pythonpath (useful if you need to import uninstalled modules to IPython engines on a cluster)') + + (opts, args) = parser.parse_args() if not opts.input_format or not opts.train or not opts.outdir or not opts.num_engines: parser.print_help() raise SystemExit @@ -71,14 +91,14 @@ def main(): if opts.model == 'popularity': # special case, don't need to run in parallel - 
subprocess.check_call(['mkdir','-p',opts.outdir]) + subprocess.check_call(['mkdir', '-p', opts.outdir]) for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) - model = ItemPopularityRecommender(method=opts.popularity_method,thresh=opts.popularity_thresh) - dataset = load_fast_sparse_matrix(opts.input_format,trainfile) + model = ItemPopularityRecommender(method=opts.popularity_method, thresh=opts.popularity_thresh) + dataset = load_fast_sparse_matrix(opts.input_format, trainfile) model.fit(dataset) - modelfile = get_modelfile(trainfile,opts.outdir) - save_recommender(model,modelfile) + modelfile = get_modelfile(trainfile, opts.outdir) + save_recommender(model, modelfile) logging.info('done') return @@ -95,9 +115,10 @@ def main(): if opts.model == 'slim': if opts.learner == 'fs_sgd': num_selected_features = 2 * opts.max_sims # preselect this many candidate similar items - model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner,num_selected_features=num_selected_features) + model = SLIM(l1_reg=opts.l1_reg, l2_reg=opts.l2_reg, model=opts.learner, + num_selected_features=num_selected_features) else: - model = SLIM(l1_reg=opts.l1_reg,l2_reg=opts.l2_reg,model=opts.learner) + model = SLIM(l1_reg=opts.l1_reg, l2_reg=opts.l2_reg, model=opts.learner) elif opts.model == 'knn': if opts.metric == 'cosine': model = CosineKNNRecommender(k=opts.max_sims) @@ -107,33 +128,36 @@ def main(): parser.print_help() raise SystemExit('unknown metric: {0}'.format(opts.metric)) elif opts.model == 'wrmf': - model = WRMFRecommender(d=opts.num_factors,alpha=opts.alpha,lbda=opts.lbda,num_iters=opts.als_iters) + model = WRMFRecommender(d=opts.num_factors, alpha=opts.alpha, lbda=opts.lbda, num_iters=opts.als_iters) elif opts.model == 'warp': - num_factors_per_engine = max(opts.num_factors/opts.num_engines,1) + num_factors_per_engine = max(opts.num_factors // opts.num_engines, 1) if opts.item_features: - model = 
WARP2MFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C) + model = WARP2MFRecommender(d=num_factors_per_engine, gamma=opts.gamma, C=opts.C) else: - model = WARPMFRecommender(d=num_factors_per_engine,gamma=opts.gamma,C=opts.C) + model = WARPMFRecommender(d=num_factors_per_engine, gamma=opts.gamma, C=opts.C) else: parser.print_help() raise SystemExit('unknown model type: {0}'.format(opts.model)) for trainfile in trainfiles: logging.info('processing {0}...'.format(trainfile)) - modelfile = get_modelfile(trainfile,opts.outdir) + modelfile = get_modelfile(trainfile, opts.outdir) if opts.model == 'wrmf': runner = WRMFRunner() - factorsdir = get_factorsdir(trainfile,opts.outdir) - runner.run(view,model,opts.input_format,trainfile,opts.num_engines,factorsdir,modelfile) + factorsdir = get_factorsdir(trainfile, opts.outdir) + runner.run(view, model, opts.input_format, trainfile, opts.num_engines, factorsdir, modelfile) elif opts.model == 'warp': runner = WARPMFRunner() - modelsdir = get_modelsdir(trainfile,opts.outdir) - runner.run(view,model,opts.input_format,trainfile,opts.item_feature_format,opts.item_features,opts.num_engines,modelsdir,opts.overwrite,modelfile) + modelsdir = get_modelsdir(trainfile, opts.outdir) + runner.run(view, model, opts.input_format, trainfile, opts.item_feature_format, opts.item_features, + opts.num_engines, modelsdir, opts.overwrite, modelfile) else: runner = ItemSimilarityRunner() - simsdir = get_simsdir(trainfile,opts.outdir) - simsfile = get_simsfile(trainfile,opts.outdir) - runner.run(view,model,opts.input_format,trainfile,opts.num_engines,simsdir,opts.overwrite,opts.max_sims,simsfile,modelfile) + simsdir = get_simsdir(trainfile, opts.outdir) + simsfile = get_simsfile(trainfile, opts.outdir) + runner.run(view, model, opts.input_format, trainfile, opts.num_engines, simsdir, opts.overwrite, + opts.max_sims, simsfile, modelfile) + if __name__ == '__main__': main() diff --git a/mrec/examples/tune_slim.py b/mrec/examples/tune_slim.py 
index 45a9762..4e72d8d 100644 --- a/mrec/examples/tune_slim.py +++ b/mrec/examples/tune_slim.py @@ -3,76 +3,87 @@ constants for SLIM by looking at model sparsity. """ +import logging import random from math import log10 -import logging from operator import itemgetter from optparse import OptionParser -try: - from sklearn.grid_search import ParameterGrid -except ImportError: - from sklearn.grid_search import IterGrid as ParameterGrid -from IPython.parallel import Client + +from ipyparallel import Client +from sklearn.model_selection import ParameterGrid from mrec import load_fast_sparse_matrix + def estimate_sparsity(task): from mrec.item_similarity.slim import SLIM - args,dataset,min_nnz,sample_items = task + args, dataset, min_nnz, sample_items = task model = SLIM(**args) tot_nnz = 0 tot_neg = 0 below_min_nnz = 0 for i in sample_items: - w = model.compute_similarities(dataset,i) - nnz = sum(w>0) + w = model.compute_similarities(dataset, i) + nnz = sum(w > 0) tot_nnz += nnz if nnz < min_nnz: below_min_nnz += 1 - tot_neg += sum(w<0) + tot_neg += sum(w < 0) num_samples = len(sample_items) - avg_nnz = float(tot_nnz)/num_samples - too_few_sims = float(below_min_nnz)/num_samples - avg_neg = float(tot_neg)/num_samples - return args,avg_nnz,too_few_sims,avg_neg + avg_nnz = float(tot_nnz) / num_samples + too_few_sims = float(below_min_nnz) / num_samples + avg_neg = float(tot_neg) / num_samples + return args, avg_nnz, too_few_sims, avg_neg + + +def pow_range(small, big): + return [10 ** v for v in range(int(log10(small)), int(log10(big)) + 1)] -def pow_range(small,big): - return [10**v for v in xrange(int(log10(small)),int(log10(big))+1)] def main(): parser = OptionParser() - parser.add_option('-d','--dataset',dest='dataset',help='path to dataset') - parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') - parser.add_option('--l1_min',dest='l1_min',type='float',help='min l1 
constant to try (expected to be a power of 10)') - parser.add_option('--l1_max',dest='l1_max',type='float',help='max l1 constant to try (expected to be a power of 10)') - parser.add_option('--l2_min',dest='l2_min',type='float',help='min l2 constant to try (expected to be a power of 10)') - parser.add_option('--l2_max',dest='l2_max',type='float',help='max l2 constant to try (expected to be a power of 10)') - parser.add_option('--max_sims',dest='max_sims',type='int',default=2000,help='max desired number of positive item similarity weights (default: %default)') - parser.add_option('--min_sims',dest='min_sims',type='int',default=15,help='min desired number of positive item similarity weights (default: %default)') - parser.add_option('--max_sparse',dest='max_sparse',type='float',default=0.01,help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)') - parser.add_option('--num_samples',dest='num_samples',type='int',default=100,help='number of sample items to evaluate for each regularization setting') - parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)') - parser.add_option('--add_module_paths',dest='add_module_paths',help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules') - - (opts,args) = parser.parse_args() + parser.add_option('-d', '--dataset', dest='dataset', help='path to dataset') + parser.add_option('--input_format', dest='input_format', + help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)') + parser.add_option('--l1_min', dest='l1_min', type='float', + help='min l1 constant to try (expected to be a power of 10)') + parser.add_option('--l1_max', dest='l1_max', type='float', + help='max l1 constant to try (expected to be a power of 10)') + parser.add_option('--l2_min', dest='l2_min', type='float', + help='min l2 constant to try (expected to be a power of 10)') + 
parser.add_option('--l2_max', dest='l2_max', type='float', + help='max l2 constant to try (expected to be a power of 10)') + parser.add_option('--max_sims', dest='max_sims', type='int', default=2000, + help='max desired number of positive item similarity weights (default: %default)') + parser.add_option('--min_sims', dest='min_sims', type='int', default=15, + help='min desired number of positive item similarity weights (default: %default)') + parser.add_option('--max_sparse', dest='max_sparse', type='float', default=0.01, + help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)') + parser.add_option('--num_samples', dest='num_samples', type='int', default=100, + help='number of sample items to evaluate for each regularization setting') + parser.add_option('--packer', dest='packer', default='json', help='packer for IPython.parallel (default: %default)') + parser.add_option('--add_module_paths', dest='add_module_paths', + help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules') + + (opts, args) = parser.parse_args() if not opts.dataset or not opts.input_format or not opts.l1_min or not opts.l1_max or not opts.l2_min or not opts.l2_max: parser.print_help() raise SystemExit - logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s') + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s') - dataset = load_fast_sparse_matrix(opts.input_format,opts.dataset) + dataset = load_fast_sparse_matrix(opts.input_format, opts.dataset) - params = {'l1_reg':pow_range(opts.l1_min,opts.l1_max), - 'l2_reg':pow_range(opts.l2_min,opts.l2_max)} + params = {'l1_reg': pow_range(opts.l1_min, opts.l1_max), + 'l2_reg': pow_range(opts.l2_min, opts.l2_max)} num_items = dataset.shape[1] - sample_items = random.sample(xrange(num_items),opts.num_samples) + sample_items = random.sample(range(num_items), opts.num_samples) 
logging.info('preparing tasks for a grid search of these values:') logging.info(params) - tasks = [(args,dataset,opts.min_sims,sample_items) for args in ParameterGrid(params)] + tasks = [(args, dataset, opts.min_sims, sample_items) for args in ParameterGrid(params)] c = Client(packer=opts.packer) view = c.load_balanced_view() @@ -84,19 +95,22 @@ def main(): c[:].execute("sys.path.append('{0}')".format(path)) logging.info('running {0} tasks in parallel...'.format(len(tasks))) - results = view.map(estimate_sparsity,tasks,ordered=False) + results = view.map(estimate_sparsity, tasks, ordered=False) - candidates = [(args,nsims,nsparse,nneg) for args,nsims,nsparse,nneg in results if nsims <= opts.max_sims and nsparse <= opts.max_sparse] + candidates = [(args, nsims, nsparse, nneg) for args, nsims, nsparse, nneg in results if + nsims <= opts.max_sims and nsparse <= opts.max_sparse] if candidates: - best = min(candidates,key=itemgetter(1)) + best = min(candidates, key=itemgetter(1)) - print 'best parameter setting: {0}'.format(best[0]) - print 'mean # positive similarity weights per item = {0:.3}'.format(best[1]) - print 'proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2]) - print 'mean # negative similarity weights per item = {0:.3}'.format(best[3]) + print('best parameter setting: {0}'.format(best[0])) + print('mean # positive similarity weights per item = {0:.3}'.format(best[1])) + print('proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims, + best[2])) + print('mean # negative similarity weights per item = {0:.3}'.format(best[3])) else: - print 'no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse' + print('no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse') + if __name__ == '__main__': main() diff --git a/mrec/item_similarity/knn.py b/mrec/item_similarity/knn.py index 
542dbda..d620078 100644 --- a/mrec/item_similarity/knn.py +++ b/mrec/item_similarity/knn.py @@ -5,7 +5,9 @@ import numpy as np from sklearn.metrics.pairwise import cosine_similarity -from recommender import ItemSimilarityRecommender + +from mrec.item_similarity.recommender import ItemSimilarityRecommender + class KNNRecommender(ItemSimilarityRecommender): """ @@ -18,21 +20,21 @@ class KNNRecommender(ItemSimilarityRecommender): The number of nearest neighbouring items to retain """ - def __init__(self,k): + def __init__(self, k): self.k = k - def compute_similarities(self,dataset,j): + def compute_similarities(self, dataset, j): A = dataset.X a = dataset.fast_get_col(j) - d = self.compute_all_similarities(A,a) + d = self.compute_all_similarities(A, a) d[j] = 0 # zero out self-similarity # now zero out similarities for all but top-k items - nn = d.argsort()[-1:-1-self.k:-1] + nn = d.argsort()[-1:-1 - self.k:-1] w = np.zeros(A.shape[1]) w[nn] = d[nn] return w - def compute_all_similarities(self,A,a): + def compute_all_similarities(self, A, a): """ Compute similarity scores between item vector a and all the rows of A. @@ -51,29 +53,32 @@ def compute_all_similarities(self,A,a): """ pass + class DotProductKNNRecommender(KNNRecommender): """ Similarity between two items is their dot product (i.e. cooccurrence count if input data is binary). """ - def compute_all_similarities(self,A,a): + def compute_all_similarities(self, A, a): return A.T.dot(a).toarray().flatten() def __str__(self): return 'DotProductKNNRecommender(k={0})'.format(self.k) + class CosineKNNRecommender(KNNRecommender): """ Similarity between two items is their cosine distance. 
""" - def compute_all_similarities(self,A,a): - return cosine_similarity(A.T,a.T).flatten() + def compute_all_similarities(self, A, a): + return cosine_similarity(A.T, a.T).flatten() def __str__(self): return 'CosineKNNRecommender(k={0})'.format(self.k) + if __name__ == '__main__': # use knn models like this: @@ -84,7 +89,7 @@ def __str__(self): random.seed(0) - print 'loading test data...' + print('loading test data...') data = """\ %%MatrixMarket matrix coordinate real general 3 5 9 @@ -98,44 +103,44 @@ def __str__(self): 3 3 1 3 4 1 """ - print data - dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) - num_users,num_items = dataset.shape + print(data) + dataset = load_fast_sparse_matrix('mm', StringIO.StringIO(data)) + num_users, num_items = dataset.shape model = CosineKNNRecommender(k=2) num_samples = 2 - def output(i,j,val): + def output(i, j, val): # convert back to 1-indexed - print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val) + print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val)) - print 'computing some item similarities...' - print 'item\tsim\tweight' + print('computing some item similarities...') + print('item\tsim\tweight') # if we want we can compute these individually without calling fit() - for i in random.sample(xrange(num_items),num_samples): - for j,weight in model.get_similar_items(i,max_similar_items=2,dataset=dataset): - output(i,j,weight) + for i in random.sample(range(num_items), num_samples): + for j, weight in model.get_similar_items(i, max_similar_items=2, dataset=dataset): + output(i, j, weight) - print 'learning entire similarity matrix...' + print('learning entire similarity matrix...') # more usually we just call train() on the entire dataset model = CosineKNNRecommender(k=2) model.fit(dataset) - print 'making some recommendations...' 
- print 'user\trec\tscore' - for u in random.sample(xrange(num_users),num_samples): - for i,score in model.recommend_items(dataset.X,u,max_items=10): - output(u,i,score) + print('making some recommendations...') + print('user\trec\tscore') + for u in random.sample(range(num_users), num_samples): + for i, score in model.recommend_items(dataset.X, u, max_items=10): + output(u, i, score) - print 'making batch recommendations...' + print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) - for u in xrange(num_users): - for i,score in recs[u]: - output(u,i,score) - - print 'making range recommendations...' - for start,end in [(0,2),(2,3)]: - recs = model.range_recommend_items(dataset.X,start,end) - for u in xrange(start,end): - for i,score in recs[u-start]: - output(u,i,score) + for u in range(num_users): + for i, score in recs[u]: + output(u, i, score) + + print('making range recommendations...') + for start, end in [(0, 2), (2, 3)]: + recs = model.range_recommend_items(dataset.X, start, end) + for u in range(start, end): + for i, score in recs[u - start]: + output(u, i, score) diff --git a/mrec/item_similarity/precomputed.py b/mrec/item_similarity/precomputed.py index f083434..ea2bcb1 100644 --- a/mrec/item_similarity/precomputed.py +++ b/mrec/item_similarity/precomputed.py @@ -2,7 +2,8 @@ Make recommendations from a precomputed item similarity matrix. """ -from recommender import ItemSimilarityRecommender +from mrec.item_similarity.recommender import ItemSimilarityRecommender + class PrecomputedItemSimilarityRecommender(ItemSimilarityRecommender): """ @@ -16,18 +17,17 @@ class PrecomputedItemSimilarityRecommender(ItemSimilarityRecommender): The precomputed item similarity matrix. 
""" - - def __init__(self,description,similarity_matrix): + def __init__(self, description, similarity_matrix): self.description = description self.set_similarity_matrix(similarity_matrix) - def set_similarity_matrix(self,similarity_matrix): + def set_similarity_matrix(self, similarity_matrix): self.similarity_matrix = similarity_matrix - def compute_similarities(self,j): - return self.similarity_matrix[j,:] + def compute_similarities(self, j): + return self.similarity_matrix[j, :] - def fit(self,dataset,item_features=None): + def fit(self, dataset, item_features=None): pass def __str__(self): diff --git a/mrec/item_similarity/recommender.py b/mrec/item_similarity/recommender.py index 4199b5d..94f8aac 100644 --- a/mrec/item_similarity/recommender.py +++ b/mrec/item_similarity/recommender.py @@ -2,17 +2,15 @@ Base class for item similarity recommenders. """ -try: - import cPickle as pickle -except ImportError: - import pickle -import numpy as np -from itertools import izip +import pickle from operator import itemgetter + +import numpy as np from scipy.sparse import csr_matrix, coo_matrix -from ..sparse import fast_sparse_matrix -from ..base_recommender import BaseRecommender +from mrec.base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix + class ItemSimilarityRecommender(BaseRecommender): """ @@ -21,7 +19,7 @@ class ItemSimilarityRecommender(BaseRecommender): need to supply the compute_similarities() method. """ - def fit(self,dataset,item_features=None): + def fit(self, dataset, item_features=None): """ Learn the complete similarity matrix from a user-item matrix. @@ -33,22 +31,22 @@ def fit(self,dataset,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for items in training set, ignored here. 
""" - if not isinstance(dataset,fast_sparse_matrix): + if not isinstance(dataset, fast_sparse_matrix): dataset = fast_sparse_matrix(dataset) - num_users,num_items = dataset.shape + num_users, num_items = dataset.shape # build up a sparse similarity matrix data = [] row = [] col = [] - for j in xrange(num_items): - w = self.compute_similarities(dataset,j) - for k,v in enumerate(w): + for j in range(num_items): + w = self.compute_similarities(dataset, j) + for k, v in enumerate(w): if v != 0: data.append(v) row.append(j) col.append(k) - idx = np.array([row,col],dtype='int32') - self.similarity_matrix = csr_matrix((data,idx),(num_items,num_items)) + idx = np.array([row, col], dtype='int32') + self.similarity_matrix = csr_matrix((data, idx), (num_items, num_items)) def _create_archive(self): """ @@ -66,17 +64,17 @@ def _create_archive(self): self.similarity_matrix = None m = pickle.dumps(self) self.similarity_matrix = tmp - if isinstance(self.similarity_matrix,np.ndarray): - archive = {'mat':self.similarity_matrix,'model':m} - elif isinstance(self.similarity_matrix,csr_matrix): + if isinstance(self.similarity_matrix, np.ndarray): + archive = {'mat': self.similarity_matrix, 'model': m} + elif isinstance(self.similarity_matrix, csr_matrix): d = self.similarity_matrix.tocoo(copy=False) - archive = {'row':d.row,'col':d.col,'data':d.data,'shape':d.shape,'model':m} + archive = {'row': d.row, 'col': d.col, 'data': d.data, 'shape': d.shape, 'model': m} else: # similarity matrix has unexpected type archive = None return archive - def _load_archive(self,archive): + def _load_archive(self, archive): """ Load fields from a numpy archive. 
""" @@ -87,11 +85,11 @@ def _load_archive(self,archive): row = archive['row'] col = archive['col'] shape = archive['shape'] - self.similarity_matrix = coo_matrix((data,(row,col)),shape=shape).tocsr() + self.similarity_matrix = coo_matrix((data, (row, col)), shape=shape).tocsr() else: raise IOError('unexpected serialization format, cannot find similarity matrix') - def load_similarity_matrix(self,filepath,num_items,offset=1): + def load_similarity_matrix(self, filepath, num_items, offset=1): """ Load a precomputed similarity matrix from tsv. @@ -105,13 +103,13 @@ def load_similarity_matrix(self,filepath,num_items,offset=1): Item index offset i.e. 1 if indices in file are 1-indexed. """ y = np.loadtxt(filepath) - row = y[:,0] - col = y[:,1] - data = y[:,2] - idx = np.array([row,col],dtype='int32')-offset - self.similarity_matrix = csr_matrix((data,idx),(num_items,num_items)) + row = y[:, 0] + col = y[:, 1] + data = y[:, 2] + idx = np.array([row, col], dtype='int32') - offset + self.similarity_matrix = csr_matrix((data, idx), (num_items, num_items)) - def compute_similarities(self,dataset,j): + def compute_similarities(self, dataset, j): """ Compute pairwise similarity scores between the j-th item and every item in the dataset. @@ -130,7 +128,7 @@ def compute_similarities(self,dataset,j): """ pass - def get_similar_items(self,j,max_similar_items=30,dataset=None): + def get_similar_items(self, j, max_similar_items=30, dataset=None): """ Get the most similar items to a supplied item. @@ -150,16 +148,16 @@ def get_similar_items(self,j,max_similar_items=30,dataset=None): Sorted list of similar items, best first. Each entry is a tuple of the form (i,score). 
""" - if hasattr(self,'similarity_matrix') and self.similarity_matrix is not None: - w = zip(self.similarity_matrix[j].indices,self.similarity_matrix[j].data) - sims = sorted(w,key=itemgetter(1),reverse=True)[:max_similar_items] - sims = [(i,f) for i,f in sims if f > 0] + if hasattr(self, 'similarity_matrix') and self.similarity_matrix is not None: + w = zip(self.similarity_matrix[j].indices, self.similarity_matrix[j].data) + sims = sorted(w, key=itemgetter(1), reverse=True)[:max_similar_items] + sims = [(i, f) for i, f in sims if f > 0] else: - w = self.compute_similarities(dataset,j) - sims = [(i,w[i]) for i in w.argsort()[-1:-max_similar_items-1:-1] if w[i] > 0] + w = self.compute_similarities(dataset, j) + sims = [(i, w[i]) for i in w.argsort()[-1:-max_similar_items - 1:-1] if w[i] > 0] return sims - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. Assumes you've already called fit() to learn the similarity matrix. @@ -192,7 +190,7 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features for i in r.argsort()[::-1]: if i not in known_items: if return_scores: - recs.append((i,r[i])) + recs.append((i, r[i])) else: recs.append(i) if len(recs) >= max_items: @@ -232,7 +230,8 @@ def batch_recommend_items(self, r = dataset * self.similarity_matrix.T except AttributeError: raise AttributeError('you must call fit() before trying to recommend items') - return self._get_recommendations_from_predictions(r,dataset,0,r.shape[0],max_items,return_scores,show_progress) + return self._get_recommendations_from_predictions(r, dataset, 0, r.shape[0], max_items, return_scores, + show_progress) def range_recommend_items(self, dataset, @@ -267,12 +266,13 @@ def range_recommend_items(self, else just a list of idxs. 
""" try: - r = dataset[user_start:user_end,:] * self.similarity_matrix.T + r = dataset[user_start:user_end, :] * self.similarity_matrix.T except AttributeError: raise AttributeError('you must call fit() before trying to recommend items') - return self._get_recommendations_from_predictions(r,dataset,user_start,user_end,max_items,return_scores) + return self._get_recommendations_from_predictions(r, dataset, user_start, user_end, max_items, return_scores) - def _get_recommendations_from_predictions(self,r,dataset,user_start,user_end,max_items,return_scores=True,show_progress=False): + def _get_recommendations_from_predictions(self, r, dataset, user_start, user_end, max_items, return_scores=True, + show_progress=False): """ Select recommendations given predicted scores/ratings. @@ -299,17 +299,17 @@ def _get_recommendations_from_predictions(self,r,dataset,user_start,user_end,max Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - r = self._zero_known_item_scores(r,dataset[user_start:user_end,:]) - recs = [[] for u in xrange(user_start,user_end)] - for u in xrange(user_start,user_end): + r = self._zero_known_item_scores(r, dataset[user_start:user_end, :]) + recs = [[] for u in range(user_start, user_end)] + for u in range(user_start, user_end): ux = u - user_start - if show_progress and ux%1000 == 0: - print ux,'..', - ru = r[ux,:] + if show_progress and ux % 1000 == 0: + print(ux, '..', ) + ru = r[ux, :] if return_scores: - recs[ux] = [(i,v) for v,i in sorted(izip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] + recs[ux] = [(i, v) for v, i in sorted(zip(ru.data, ru.indices), reverse=True) if v > 0][:max_items] else: - recs[ux] = [i for v,i in sorted(izip(ru.data,ru.indices),reverse=True) if v > 0][:max_items] + recs[ux] = [i for v, i in sorted(zip(ru.data, ru.indices), reverse=True) if v > 0][:max_items] if show_progress: - print + print() return recs diff --git a/mrec/item_similarity/slim.py 
b/mrec/item_similarity/slim.py index 2cf698a..ccd426e 100644 --- a/mrec/item_similarity/slim.py +++ b/mrec/item_similarity/slim.py @@ -11,12 +11,11 @@ http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf """ -from sklearn.linear_model import SGDRegressor, ElasticNet -from sklearn.preprocessing import binarize -import sklearn import numpy as np +import sklearn +from sklearn.linear_model import SGDRegressor, ElasticNet -from recommender import ItemSimilarityRecommender +from mrec.item_similarity.recommender import ItemSimilarityRecommender def parse_version(version_string): @@ -30,16 +29,16 @@ class NNFeatureSelectingSGDRegressor(object): Wraps nearest-neighbour feature selection and regression in a single model. """ - def __init__(self,model,k): + def __init__(self, model, k): self.model = model self.k = k - def fit(self,A,a): + def fit(self, A, a): # find k-NN by brute force d = A.T.dot(a).flatten() # distance = dot product - nn = d.argsort()[-1:-1-self.k:-1] + nn = d.argsort()[-1:-1 - self.k:-1] # fit the model to selected features only - self.model.fit(A[:,nn],a) + self.model.fit(A[:, nn], a) # set our weights for the selected "features" i.e. items self.coef_ = np.zeros(A.shape[1]) self.coef_[nn] = self.model.coef_ @@ -47,6 +46,7 @@ def fit(self,A,a): def __str__(self): return 'NN-feature selecting {0}'.format(self.model) + class SLIM(ItemSimilarityRecommender): """ Parameters @@ -68,6 +68,7 @@ class SLIM(ItemSimilarityRecommender): :elasticnet: ElasticNet :fs_sgd: NNFeatureSelectingSGDRegressor """ + def __init__(self, l1_reg=0.001, l2_reg=0.0001, @@ -75,39 +76,40 @@ def __init__(self, ignore_negative_weights=False, num_selected_features=200, model='sgd'): - alpha = l1_reg+l2_reg - l1_ratio = l1_reg/alpha + alpha = l1_reg + l2_reg + l1_ratio = l1_reg / alpha if parse_version(sklearn.__version__) <= (0, 14, 1): # Backward compat: in old versions of scikit-learn l1_ratio had # the opposite sign... 
l1_ratio = (1 - l1_ratio) if model == 'sgd': - self.model = SGDRegressor(penalty='elasticnet',fit_intercept=fit_intercept,alpha=alpha,l1_ratio=l1_ratio) + self.model = SGDRegressor(penalty='elasticnet', fit_intercept=fit_intercept, alpha=alpha, l1_ratio=l1_ratio) elif model == 'elasticnet': - self.model = ElasticNet(alpha=alpha,l1_ratio=l1_ratio,positive=True,fit_intercept=fit_intercept,copy_X=False) + self.model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, positive=True, fit_intercept=fit_intercept, + copy_X=False) elif model == 'fs_sgd': - m = SGDRegressor(penalty='elasticnet',fit_intercept=fit_intercept,alpha=alpha,l1_ratio=l1_ratio) - self.model = NNFeatureSelectingSGDRegressor(m,num_selected_features) + m = SGDRegressor(penalty='elasticnet', fit_intercept=fit_intercept, alpha=alpha, l1_ratio=l1_ratio) + self.model = NNFeatureSelectingSGDRegressor(m, num_selected_features) else: raise SystemExit('unknown model type: {0}'.format(model)) self.ignore_negative_weights = ignore_negative_weights - def compute_similarities(self,dataset,j): + def compute_similarities(self, dataset, j): """Compute item similarity weights for item j.""" # zero out the j-th column of the input so we get w[j] = 0 a = dataset.fast_get_col(j) - dataset.fast_update_col(j,np.zeros(a.nnz)) - self.model.fit(dataset.X,a.toarray().ravel()) + dataset.fast_update_col(j, np.zeros(a.nnz)) + self.model.fit(dataset.X, a.toarray().ravel()) # reinstate the j-th column - dataset.fast_update_col(j,a.data) + dataset.fast_update_col(j, a.data) w = self.model.coef_ if self.ignore_negative_weights: - w[w<0] = 0 + w[w < 0] = 0 return w - def compute_similarities_from_vec(self,dataset,a): + def compute_similarities_from_vec(self, dataset, a): """Compute item similarity weights for out-of-dataset item vector.""" - self.model.fit(dataset.X,a) + self.model.fit(dataset.X, a) return self.model.coef_ def __str__(self): @@ -116,6 +118,7 @@ def __str__(self): else: return 'SLIM({0})'.format(self.model) + if __name__ == 
'__main__': # use SLIM like this: @@ -126,7 +129,7 @@ def __str__(self): random.seed(0) - print 'loading test data...' + print('loading test data...') data = """\ %%MatrixMarket matrix coordinate real general 3 5 9 @@ -140,44 +143,44 @@ def __str__(self): 3 3 1 3 4 1 """ - print data - dataset = load_fast_sparse_matrix('mm',StringIO.StringIO(data)) - num_users,num_items = dataset.shape + print(data) + dataset = load_fast_sparse_matrix('mm', StringIO.StringIO(data)) + num_users, num_items = dataset.shape model = SLIM() num_samples = 2 - def output(i,j,val): + def output(i, j, val): # convert back to 1-indexed - print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val) + print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val)) - print 'computing some item similarities...' - print 'item\tsim\tweight' + print('computing some item similarities...') + print('item\tsim\tweight') # if we want we can compute these individually without calling fit() - for i in random.sample(xrange(num_items),num_samples): - for j,weight in model.get_similar_items(i,max_similar_items=10,dataset=dataset): - output(i,j,weight) + for i in random.sample(range(num_items), num_samples): + for j, weight in model.get_similar_items(i, max_similar_items=10, dataset=dataset): + output(i, j, weight) - print 'learning entire similarity matrix...' + print('learning entire similarity matrix...') # usually we'll call train() on the entire dataset model = SLIM() model.fit(dataset) - print 'making some recommendations...' - print 'user\trec\tscore' - for u in random.sample(xrange(num_users),num_samples): - for i,score in model.recommend_items(dataset.X,u,max_items=10): - output(u,i,score) + print('making some recommendations...') + print('user\trec\tscore') + for u in random.sample(range(num_users), num_samples): + for i, score in model.recommend_items(dataset.X, u, max_items=10): + output(u, i, score) - print 'making batch recommendations...' 
+ print('making batch recommendations...') recs = model.batch_recommend_items(dataset.X) - for u in xrange(num_users): - for i,score in recs[u]: - output(u,i,score) - - print 'making range recommendations...' - for start,end in [(0,2),(2,3)]: - recs = model.range_recommend_items(dataset.X,start,end) - for u in xrange(start,end): - for i,score in recs[u-start]: - output(u,i,score) + for u in range(num_users): + for i, score in recs[u]: + output(u, i, score) + + print('making range recommendations...') + for start, end in [(0, 2), (2, 3)]: + recs = model.range_recommend_items(dataset.X, start, end) + for u in range(start, end): + for i, score in recs[u - start]: + output(u, i, score) diff --git a/mrec/mf/climf.py b/mrec/mf/climf.py index 61ba395..8729cb9 100644 --- a/mrec/mf/climf.py +++ b/mrec/mf/climf.py @@ -11,7 +11,7 @@ """ from math import exp, log -import random + import numpy as np from mrec.mf.recommender import MatrixFactorizationRecommender @@ -22,32 +22,33 @@ def g(x): """sigmoid function""" - return 1/(1+exp(-x)) + return 1 / (1 + exp(-x)) + def dg(x): """derivative of sigmoid function""" - return exp(x)/(1+exp(x))**2 + return exp(x) / (1 + exp(x)) ** 2 -class CLiMFRecommender(MatrixFactorizationRecommender): - def __init__(self,d,lbda=0.01,gamma=0.01,max_iters=25): +class CLiMFRecommender(MatrixFactorizationRecommender): + def __init__(self, d, lbda=0.01, gamma=0.01, max_iters=25): self.d = d self.lbda = lbda self.gamma = gamma self.max_iters = max_iters - def fit(self,data): - self.U = 0.01*np.random.random_sample((data.shape[0],self.d)) - self.V = 0.01*np.random.random_sample((data.shape[1],self.d)) + def fit(self, data): + self.U = 0.01 * np.random.random_sample((data.shape[0], self.d)) + self.V = 0.01 * np.random.random_sample((data.shape[1], self.d)) # TODO: create a validation set - for iter in xrange(self.max_iters): - print 'iteration {0}:'.format(iter+1) - print 'objective = {0:.4f}'.format(self.objective(data)) + for some_iter in 
range(self.max_iters): + print('iteration {0}:'.format(some_iter + 1)) + print('objective = {0:.4f}'.format(self.objective(data))) self.update(data) # TODO: compute MRR on validation set, terminate if appropriate - def precompute_f(self,data,i): + def precompute_f(self, data, i): """ precompute f[j] = @@ -61,10 +62,10 @@ def precompute_f(self,data,i): dot products for all j in data[i] """ items = data[i].indices - f = dict((j,np.dot(self.U[i],self.V[j])) for j in items) + f = dict((j, np.dot(self.U[i], self.V[j])) for j in items) return f - def objective(self,data): + def objective(self, data): """ compute objective function F(U,V) @@ -76,16 +77,16 @@ def objective(self,data): returns: current value of F(U,V) """ - F = -0.5*self.lbda*(np.sum(self.U*self.U)+np.sum(self.V*self.V)) - for i in xrange(len(self.U)): - f = self.precompute_f(data,i) + F = -0.5 * self.lbda * (np.sum(self.U * self.U) + np.sum(self.V * self.V)) + for i in range(len(self.U)): + f = self.precompute_f(data, i) for j in f: F += log(g(f[j])) for k in f: - F += log(1-g(f[k]-f[j])) + F += log(1 - g(f[k] - f[j])) return F - def update(self,data): + def update(self, data): """ update user/item factors using stochastic gradient ascent @@ -96,20 +97,20 @@ def update(self,data): lbda : regularization constant lambda gamma: learning rate """ - for i in xrange(len(self.U)): - dU = -self.lbda*self.U[i] - f = self.precompute_f(data,i) + for i in range(len(self.U)): + dU = -self.lbda * self.U[i] + f = self.precompute_f(data, i) for j in f: - dV = g(-f[j])-self.lbda*self.V[j] + dV = g(-f[j]) - self.lbda * self.V[j] for k in f: - dV += dg(f[j]-f[k])*(1/(1-g(f[k]-f[j]))-1/(1-g(f[j]-f[k])))*self.U[i] - self.V[j] += self.gamma*dV - dU += g(-f[j])*self.V[j] + dV += dg(f[j] - f[k]) * (1 / (1 - g(f[k] - f[j])) - 1 / (1 - g(f[j] - f[k]))) * self.U[i] + self.V[j] += self.gamma * dV + dU += g(-f[j]) * self.V[j] for k in f: - dU += (self.V[j]-self.V[k])*dg(f[k]-f[j])/(1-g(f[k]-f[j])) - self.U[i] += self.gamma*dU + dU += 
(self.V[j] - self.V[k]) * dg(f[k] - f[j]) / (1 - g(f[k] - f[j])) + self.U[i] += self.gamma * dU - def compute_mrr(self,data,test_users=None): + def compute_mrr(self, data, test_users=None): """ compute average Mean Reciprocal Rank of data according to factors @@ -125,23 +126,24 @@ def compute_mrr(self,data,test_users=None): mrr = [] if test_users is None: test_users = range(len(self.U)) - for ix,i in enumerate(test_users): + for ix, i in enumerate(test_users): items = set(data[i].indices) if not items: continue - predictions = np.sum(np.tile(self.U[i],(len(self.V),1))*self.V,axis=1) + predictions = np.sum(np.tile(self.U[i], (len(self.V), 1)) * self.V, axis=1) found = False - for rank,item in enumerate(np.argsort(predictions)[::-1]): + for rank, item in enumerate(np.argsort(predictions)[::-1]): if item in items: - mrr.append(1.0/(rank+1)) + mrr.append(1.0 / (rank + 1)) found = True break if not found: - print 'fail, no relevant items predicted for test user {0}'.format(i+1) - print 'known items: {0}'.format(items) - assert(len(mrr) == len(test_users)) + print('fail, no relevant items predicted for test user {0}'.format(i + 1)) + print('known items: {0}'.format(items)) + assert (len(mrr) == len(test_users)) return np.mean(mrr) + def main(): import sys from mrec import load_sparse_matrix, save_recommender @@ -152,13 +154,15 @@ def main(): outfile = sys.argv[3] # load training set as scipy sparse matrix - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) model = CLiMFRecommender(d=5) model.fit(train) - save_recommender(model,outfile) + save_recommender(model, outfile) + if __name__ == '__main__': import cProfile + cProfile.run('main()') diff --git a/mrec/mf/evaluate.py b/mrec/mf/evaluate.py index 02c0794..c22ef63 100644 --- a/mrec/mf/evaluate.py +++ b/mrec/mf/evaluate.py @@ -1,29 +1,30 @@ -def retrain_recommender(model,dataset): +def retrain_recommender(model, dataset): model.fit(dataset.X) + if __name__ == '__main__': 
- try: - from sklearn.grid_search import ParameterGrid - except ImportError: - from sklearn.grid_search import IterGrid as ParameterGrid + from sklearn.model_selection import ParameterGrid from optparse import OptionParser - from warp import WARPMFRecommender + from mrec.mf.warp import WARPMFRecommender from mrec.evaluation.metrics import * parser = OptionParser() - parser.add_option('-m','--main_split_dir',dest='main_split_dir',help='directory containing 50/50 splits for main evaluation') - parser.add_option('-l','--loo_split_dir',dest='loo_split_dir',help='directory containing LOO splits for hit rate evaluation') - parser.add_option('-n','--num_splits',dest='num_splits',type='int',default=5,help='number of splits in each directory (default: %default)') - - (opts,args) = parser.parse_args() + parser.add_option('-m', '--main_split_dir', dest='main_split_dir', + help='directory containing 50/50 splits for main evaluation') + parser.add_option('-l', '--loo_split_dir', dest='loo_split_dir', + help='directory containing LOO splits for hit rate evaluation') + parser.add_option('-n', '--num_splits', dest='num_splits', type='int', default=5, + help='number of splits in each directory (default: %default)') + + (opts, args) = parser.parse_args() if not (opts.main_split_dir or opts.loo_split_dir) or not opts.num_splits: parser.print_help() raise SystemExit - print 'doing a grid search for regularization parameters...' - params = {'d':[100],'gamma':[0.01],'C':[100],'max_iter':[100000],'validation_iters':[500]} + print('doing a grid search for regularization parameters...') + params = {'d': [100], 'gamma': [0.01], 'C': [100], 'max_iter': [100000], 'validation_iters': [500]} models = [WARPMFRecommender(**a) for a in ParameterGrid(params)] for train in glob: @@ -33,19 +34,19 @@ def retrain_recommender(model,dataset): # test is a dict id->[id,id,...] 
    if opts.main_split_dir:
-        generate_main_metrics = generate_metrics(get_known_items_from_dict,compute_main_metrics)
+        generate_main_metrics = generate_metrics(get_known_items_from_dict, compute_main_metrics)
         main_metrics = run_evaluation(models,
                                       retrain_recommender,
-                                      load_splits(opts.main_split_dir,opts.num_splits),
+                                      load_splits(opts.main_split_dir, opts.num_splits),
                                       opts.num_splits,
                                       generate_main_metrics)
-        print_report(models,main_metrics)
+        print_report(models, main_metrics)
 
     if opts.loo_split_dir:
-        generate_hit_rate = generate_metrics(get_known_items_from_dict,compute_hit_rate)
+        generate_hit_rate = generate_metrics(get_known_items_from_dict, compute_hit_rate)
         hit_rate_metrics = run_evaluation(models,
                                           retrain_recommender,
-                                          load_splits(opts.loo_split_dir,opts.num_splits),
+                                          load_splits(opts.loo_split_dir, opts.num_splits),
                                           opts.num_splits,
                                           generate_hit_rate)
-        print_report(models,hit_rate_metrics)
+        print_report(models, hit_rate_metrics)
diff --git a/mrec/mf/model/warp.py b/mrec/mf/model/warp.py
index 0465343..bf2d99f 100644
--- a/mrec/mf/model/warp.py
+++ b/mrec/mf/model/warp.py
@@ -1,27 +1,25 @@
 import numpy as np
-import random
-from itertools import izip
+from mrec.mf.model.warp_fast import warp_sample, apply_updates
 
 from mrec.evaluation import metrics
-from warp_fast import warp_sample, apply_updates
 
 
 class WARPBatchUpdate(object):
     """Collection of arrays to hold a batch of WARP sgd updates."""
 
-    def __init__(self,batch_size,d):
-        self.u = np.zeros(batch_size,dtype='int32')
-        self.dU = np.zeros((batch_size,d),order='F')
-        self.v_pos = np.zeros(batch_size,dtype='int32')
-        self.dV_pos = np.zeros((batch_size,d))
-        self.v_neg = np.zeros(batch_size,dtype='int32')
+    def __init__(self, batch_size, d):
+        self.u = np.zeros(batch_size, dtype='int32')
+        self.dU = np.zeros((batch_size, d), order='F')
+        self.v_pos = np.zeros(batch_size, dtype='int32')
+        self.dV_pos = np.zeros((batch_size, d))
+        self.v_neg = np.zeros(batch_size, dtype='int32')
+ self.dV_neg = np.zeros((batch_size, d)) def clear(self): pass - def set_update(self,ix,update): - u,v_pos,v_neg,dU,dV_pos,dV_neg = update + def set_update(self, ix, update): + u, v_pos, v_neg, dU, dV_pos, dV_neg = update self.u[ix] = u self.dU[ix] = dU self.v_pos[ix] = v_pos @@ -29,6 +27,7 @@ def set_update(self,ix,update): self.v_neg[ix] = v_neg self.dV_neg[ix] = dV_neg + class WARPDecomposition(object): """ Matrix embedding optimizing the WARP loss. @@ -43,14 +42,14 @@ class WARPDecomposition(object): The embedding dimension for the decomposition. """ - def __init__(self,num_rows,num_cols,d): + def __init__(self, num_rows, num_cols, d): # initialize factors to small random values - self.U = d**-0.5*np.random.random_sample((num_rows,d)) - self.V = d**-0.5*np.random.random_sample((num_cols,d)) + self.U = d ** -0.5 * np.random.random_sample((num_rows, d)) + self.V = d ** -0.5 * np.random.random_sample((num_cols, d)) # ensure memory layout avoids extra allocation in dot product self.U = np.asfortranarray(self.U) - def compute_gradient_step(self,u,i,j,L): + def compute_gradient_step(self, u, i, j, L): """ Compute a gradient step from results of sampling. @@ -81,24 +80,25 @@ def compute_gradient_step(self,u,i,j,L): dV_neg : numpy.ndarray Gradient step for V[j]. 
""" - dU = L*(self.V[i]-self.V[j]) - dV_pos = L*self.U[u] - dV_neg = -L*self.U[u] - return u,i,j,dU,dV_pos,dV_neg + dU = L * (self.V[i] - self.V[j]) + dV_pos = L * self.U[u] + dV_neg = -L * self.U[u] + return u, i, j, dU, dV_pos, dV_neg - def apply_updates(self,updates,gamma,C): + def apply_updates(self, updates, gamma, C): # delegate to cython implementation - apply_updates(self.U,updates.u,updates.dU,gamma,C) - apply_updates(self.V,updates.v_pos,updates.dV_pos,gamma,C) - apply_updates(self.V,updates.v_neg,updates.dV_neg,gamma,C) + apply_updates(self.U, updates.u, updates.dU, gamma, C) + apply_updates(self.V, updates.v_pos, updates.dV_pos, gamma, C) + apply_updates(self.V, updates.v_neg, updates.dV_neg, gamma, C) - def reconstruct(self,rows): + def reconstruct(self, rows): if rows is None: U = self.U else: - U = np.asfortranarray(self.U[rows,:]) + U = np.asfortranarray(self.U[rows, :]) return U.dot(self.V.T) + class WARP(object): """ Learn low-dimensional embedding optimizing the WARP loss. @@ -151,9 +151,11 @@ def __init__(self, self.max_trials = max_trials def __str__(self): - return 'WARP(d={0},gamma={1},C={2},max_iters={3},validation_iters={4},batch_size={5},positive_thresh={6},max_trials={7})'.format(self.d,self.gamma,self.C,self.max_iters,self.validation_iters,self.batch_size,self.positive_thresh,self.max_trials) + return 'WARP(d={0},gamma={1},C={2},max_iters={3},validation_iters={4},batch_size={5},positive_thresh={6},max_trials={7})'.format( + self.d, self.gamma, self.C, self.max_iters, self.validation_iters, self.batch_size, self.positive_thresh, + self.max_trials) - def fit(self,train,validation=None): + def fit(self, train, validation=None): """ Learn factors from training set. The dot product of the factors reconstructs the training matrix approximately, minimizing the @@ -175,56 +177,56 @@ def fit(self,train,validation=None): self : object This model itself. 
""" - num_rows,num_cols = train.shape - decomposition = WARPDecomposition(num_rows,num_cols,self.d) - updates = WARPBatchUpdate(self.batch_size,self.d) + num_rows, num_cols = train.shape + decomposition = WARPDecomposition(num_rows, num_cols, self.d) + updates = WARPBatchUpdate(self.batch_size, self.d) self.precompute_warp_loss(num_cols) - self._fit(decomposition,updates,train,validation) + self._fit(decomposition, updates, train, validation) self.U_ = decomposition.U self.V_ = decomposition.V return self - def _fit(self,decomposition,updates,train,validation): + def _fit(self, decomposition, updates, train, validation): precs = [] tot_trials = 0 - for it in xrange(self.max_iters): + for it in range(self.max_iters): if it % self.validation_iters == 0: - print 'tot_trials',tot_trials + print('tot_trials', tot_trials) tot_trials = 0 - prec = self.estimate_precision(decomposition,train,validation) + prec = self.estimate_precision(decomposition, train, validation) precs.append(prec) - print '{0}: validation precision = {1:.3f}'.format(it,precs[-1]) + print('{0}: validation precision = {1:.3f}'.format(it, precs[-1])) if len(precs) > 3 and precs[-1] < precs[-2] and precs[-2] < precs[-3]: - print 'validation precision got worse twice, terminating' + print('validation precision got worse twice, terminating') break - tot_trials += self.compute_updates(train,decomposition,updates) - decomposition.apply_updates(updates,self.gamma,self.C) + tot_trials += self.compute_updates(train, decomposition, updates) + decomposition.apply_updates(updates, self.gamma, self.C) - def precompute_warp_loss(self,num_cols): + def precompute_warp_loss(self, num_cols): """ Precompute WARP loss for each possible rank: L(i) = \sum_{0,i}{1/(i+1)} """ - assert(num_cols>1) + assert (num_cols > 1) self.warp_loss = np.ones(num_cols) - for i in xrange(1,num_cols): - self.warp_loss[i] = self.warp_loss[i-1]+1.0/(i+1) + for i in range(1, num_cols): + self.warp_loss[i] = self.warp_loss[i - 1] + 1.0 / (i + 1) 
-    def compute_updates(self,train,decomposition,updates):
+    def compute_updates(self, train, decomposition, updates):
         updates.clear()
         tot_trials = 0
-        for ix in xrange(self.batch_size):
-            u,i,j,N,trials = self.sample(train,decomposition)
+        for ix in range(self.batch_size):
+            u, i, j, N, trials = self.sample(train, decomposition)
             tot_trials += trials
-            L = self.estimate_warp_loss(train,u,N)
-            updates.set_update(ix,decomposition.compute_gradient_step(u,i,j,L))
+            L = self.estimate_warp_loss(train, u, N)
+            updates.set_update(ix, decomposition.compute_gradient_step(u, i, j, L))
         return tot_trials
 
-    def sample(self,train,decomposition):
+    def sample(self, train, decomposition):
         # delegate to cython implementation
         return warp_sample(decomposition.U,
                            decomposition.V,
@@ -234,13 +236,13 @@
                            self.positive_thresh,
                            self.max_trials)
 
-    def estimate_warp_loss(self,train,u,N):
+    def estimate_warp_loss(self, train, u, N):
         num_cols = train.shape[1]
-        nnz = train.indptr[u+1]-train.indptr[u]
-        estimated_rank = (num_cols-nnz-1)/N
+        nnz = train.indptr[u + 1] - train.indptr[u]
+        estimated_rank = (num_cols - nnz - 1) // N
         return self.warp_loss[estimated_rank]
 
-    def estimate_precision(self,decomposition,train,validation,k=30):
+    def estimate_precision(self, decomposition, train, validation, k=30):
         """
         Compute prec@k for a sample of training rows.
 
@@ -269,10 +271,10 @@
         recommendations because we do not exclude training
         cols with zero ratings from the top-k predictions
         evaluated.
""" - if isinstance(validation,dict): + if isinstance(validation, dict): have_validation_set = True rows = validation.keys() - elif isinstance(validation,(int,long)): + elif isinstance(validation, (int, long)): have_validation_set = False rows = range(validation) else: @@ -280,12 +282,11 @@ def estimate_precision(self,decomposition,train,validation,k=30): r = decomposition.reconstruct(rows) prec = 0 - for u,ru in izip(rows,r): + for u, ru in zip(rows, r): predicted = ru.argsort()[::-1][:k] if have_validation_set: actual = validation[u] else: actual = train[u].indices[train[u].data > 0] - prec += metrics.prec(predicted,actual,k) - return float(prec)/len(rows) - + prec += metrics.prec(predicted, actual, k) + return float(prec) / len(rows) diff --git a/mrec/mf/model/warp2.py b/mrec/mf/model/warp2.py index 66a5925..18b7417 100644 --- a/mrec/mf/model/warp2.py +++ b/mrec/mf/model/warp2.py @@ -1,25 +1,26 @@ import numpy as np import scipy -import random - -from warp import WARPBatchUpdate, WARPDecomposition, WARP from warp_fast import warp2_sample +from mrec.mf.model.warp import WARPBatchUpdate, WARPDecomposition, WARP + + class WARP2BatchUpdate(WARPBatchUpdate): """Collection of arrays to hold a batch of sgd updates.""" - def __init__(self,batch_size,num_features,d): - WARPBatchUpdate.__init__(self,batch_size,d) - self.dW = np.zeros((num_features,d)) + def __init__(self, batch_size, num_features, d): + WARPBatchUpdate.__init__(self, batch_size, d) + self.dW = np.zeros((num_features, d)) def clear(self): self.dW[:] = 0 - def set_update(self,ix,update): - u,v_pos,v_neg,dU,dV_pos,dV_neg,dW = update - WARPBatchUpdate.set_update(self,ix,(u,v_pos,v_neg,dU,dV_pos,dV_neg)) + def set_update(self, ix, update): + u, v_pos, v_neg, dU, dV_pos, dV_neg, dW = update + WARPBatchUpdate.set_update(self, ix, (u, v_pos, v_neg, dU, dV_pos, dV_neg)) self.dW += dW + class WARP2Decomposition(WARPDecomposition): """ Joint matrix and feature embedding optimizing the WARP loss. 
@@ -36,14 +37,14 @@ class WARP2Decomposition(WARPDecomposition): The embedding dimension. """ - def __init__(self,num_rows,num_cols,X,d): - WARPDecomposition.__init__(self,num_rows,num_cols,d) + def __init__(self, num_rows, num_cols, X, d): + WARPDecomposition.__init__(self, num_rows, num_cols, d) # W holds latent factors for each item feature - self.W = d**-0.5*np.random.random_sample((X.shape[1],d)) + self.W = d ** -0.5 * np.random.random_sample((X.shape[1], d)) self.X = X - self.is_sparse = isinstance(X,scipy.sparse.csr_matrix) + self.is_sparse = isinstance(X, scipy.sparse.csr_matrix) - def compute_gradient_step(self,u,i,j,L): + def compute_gradient_step(self, u, i, j, L): """ Compute a gradient step from results of sampling. @@ -76,33 +77,34 @@ def compute_gradient_step(self,u,i,j,L): dW : numpy.ndarray Gradient step for W. """ - dU = L*(self.V[i]-self.V[j]) - dV_pos = L*self.U[u] - dV_neg = -L*self.U[u] - dx = self.X[i]-self.X[j] + dU = L * (self.V[i] - self.V[j]) + dV_pos = L * self.U[u] + dV_neg = -L * self.U[u] + dx = self.X[i] - self.X[j] if not self.is_sparse: dx = np.atleast_2d(dx) - dW = L*dx.T.dot(np.atleast_2d(self.U[u])) - return u,i,j,dU,dV_pos,dV_neg,dW + dW = L * dx.T.dot(np.atleast_2d(self.U[u])) + return u, i, j, dU, dV_pos, dV_neg, dW - def apply_updates(self,updates,gamma,C): - WARPDecomposition.apply_updates(self,updates,gamma,C) - self.apply_matrix_update(self.W,updates.dW,gamma,C) + def apply_updates(self, updates, gamma, C): + WARPDecomposition.apply_updates(self, updates, gamma, C) + self.apply_matrix_update(self.W, updates.dW, gamma, C) - def apply_matrix_update(self,W,dW,gamma,C): - W += gamma*dW + def apply_matrix_update(self, W, dW, gamma, C): + W += gamma * dW # ensure that ||W_k|| < C for all k - p = np.sum(np.abs(W)**2,axis=-1)**0.5/C - p[p<1] = 1 - W /= p[:,np.newaxis] + p = np.sum(np.abs(W) ** 2, axis=-1) ** 0.5 / C + p[p < 1] = 1 + W /= p[:, np.newaxis] - def reconstruct(self,rows): + def reconstruct(self, rows): if rows is 
None: U = self.U else: - U = np.asfortranarray(self.U[rows,:]) + U = np.asfortranarray(self.U[rows, :]) return U.dot(self.V.T + self.X.dot(self.W).T) + class WARP2(WARP): """ Learn low-dimensional embedding optimizing the WARP loss. @@ -138,7 +140,7 @@ class WARP2(WARP): Item feature factors. """ - def fit(self,train,X,validation=None): + def fit(self, train, X, validation=None): """ Learn embedding from training set. A suitable dot product of the factors reconstructs the training matrix approximately, minimizing @@ -162,12 +164,12 @@ def fit(self,train,X,validation=None): self : object This model itself. """ - num_rows,num_cols = train.shape - decomposition = WARP2Decomposition(num_rows,num_cols,X,self.d) - updates = WARP2BatchUpdate(self.batch_size,X.shape[1],self.d) + num_rows, num_cols = train.shape + decomposition = WARP2Decomposition(num_rows, num_cols, X, self.d) + updates = WARP2BatchUpdate(self.batch_size, X.shape[1], self.d) self.precompute_warp_loss(num_cols) - self._fit(decomposition,updates,train,validation) + self._fit(decomposition, updates, train, validation) self.U_ = decomposition.U self.V_ = decomposition.V @@ -175,7 +177,7 @@ def fit(self,train,X,validation=None): return self - def sample(self,train,decomposition): + def sample(self, train, decomposition): # delegate to cython implementation return warp2_sample(decomposition.U, decomposition.V, @@ -186,4 +188,3 @@ def sample(self,train,decomposition): train.indptr, self.positive_thresh, self.max_trials) - diff --git a/mrec/mf/model/warp_fast.pyx b/mrec/mf/model/warp_fast.pyx index e4b8417..c65569d 100644 --- a/mrec/mf/model/warp_fast.pyx +++ b/mrec/mf/model/warp_fast.pyx @@ -121,7 +121,7 @@ cdef sample_violating_negative_example(np.ndarray[np.float_t,ndim=2] U, num_items = V.shape[0] r = U[u].dot(V[i]) - for N in xrange(1,max_trials): + for N in range(1,max_trials): # find j!=i s.t. 
data[u,j] < data[u,i] j = sample_negative_example(num_items,vals,indices,begin,end,ix) if r - U[u].dot(V[j]) < 1: @@ -166,7 +166,7 @@ cdef sample_negative_example(num_items, # sample item uniformly with replacement j = rand() % num_items found = 0 - for jx in xrange(begin,end): + for jx in range(begin,end): if indices[jx] == j: found = 1 break @@ -242,7 +242,7 @@ def apply_updates(np.ndarray[np.float_t,ndim=2] F, assert(rows.shape[0] == deltas.shape[0]) num = rows.shape[0] - for i in xrange(num): + for i in range(num): row = rows[i] delta = deltas[i] F[row] += gamma*delta @@ -379,7 +379,7 @@ cdef sample_violating_negative_example2(np.ndarray[np.float_t,ndim=2] U, XW = sparse_sdot(xbuf,W,X,i,is_sparse) r = U[u].dot(V[i] + XW) - for N in xrange(1,max_trials): + for N in range(1,max_trials): # find j!=i s.t. data[u,j] < data[u,i] j = sample_negative_example(num_items,vals,indices,begin,end,ix) XW = sparse_sdot(xbuf,W,X,j,is_sparse) @@ -399,10 +399,10 @@ cdef sparse_sdot(np.ndarray[np.float_t,ndim=1] xbuf, if is_sparse: # TODO: surely there's something built in to do this... - for ix in xrange(X.indptr[i],X.indptr[i+1]): + for ix in range(X.indptr[i],X.indptr[i+1]): xbuf[X.indices[ix]] = X.data[ix] XW = xbuf.dot(W) - for ix in xrange(X.indptr[i],X.indptr[i+1]): + for ix in range(X.indptr[i],X.indptr[i+1]): xbuf[X.indices[ix]] = 0 else: XW = X[i].dot(W) diff --git a/mrec/mf/recommender.py b/mrec/mf/recommender.py index f7e422c..13bcfc4 100644 --- a/mrec/mf/recommender.py +++ b/mrec/mf/recommender.py @@ -3,16 +3,13 @@ by matrix factorization. """ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle + import numpy as np -from itertools import izip -from scipy.sparse import csr_matrix from mrec.base_recommender import BaseRecommender + class MatrixFactorizationRecommender(BaseRecommender): """ Base class for matrix factorization recommenders. 
@@ -30,13 +27,13 @@ def _create_archive(self): """ # pickle the model without its factors # then use numpy to save the factors efficiently - tmp = (self.U,self.V) + tmp = (self.U, self.V) self.U = self.V = None m = pickle.dumps(self) - self.U,self.V = tmp - return {'model':m,'U':self.U,'V':self.V} + self.U, self.V = tmp + return {'model': m, 'U': self.U, 'V': self.V} - def _load_archive(self,archive): + def _load_archive(self, archive): """ Load fields from a numpy archive. """ @@ -44,11 +41,11 @@ def _load_archive(self,archive): self.V = archive['V'] def __str__(self): - if hasattr(self,'description'): + if hasattr(self, 'description'): return self.description return 'MatrixFactorizationRecommender' - def fit(self,train): + def fit(self, train): """ Learn user and item factors from training dataset. @@ -59,7 +56,7 @@ def fit(self,train): """ pass - def load_factors(self,user_factor_filepath,item_factor_filepath,fmt): + def load_factors(self, user_factor_filepath, item_factor_filepath, fmt): """ Load precomputed user and item factors from file. @@ -89,7 +86,7 @@ def load_factors(self,user_factor_filepath,item_factor_filepath,fmt): # ensure that memory layout avoids extra allocation in dot product self.U = np.asfortranarray(self.U) - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend up to max_items most highly recommended items for user u. Assumes you've already called fit() to learn the factors. @@ -113,10 +110,10 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features List of (idx,score) pairs if return_scores is True, else just a list of idxs. 
""" - r = self.predict_ratings(u,item_features=item_features) - return self._get_recommendations_from_predictions(r,dataset,u,u+1,max_items,return_scores)[0] + r = self.predict_ratings(u, item_features=item_features) + return self._get_recommendations_from_predictions(r, dataset, u, u + 1, max_items, return_scores)[0] - def predict_ratings(self,users=None,item_features=None): + def predict_ratings(self, users=None, item_features=None): """ Predict ratings/scores for all items for supplied users. Assumes you've already called fit() to learn the factors. @@ -137,13 +134,13 @@ def predict_ratings(self,users=None,item_features=None): predictions : numpy.ndarray, shape = [len(users), num_items] Predicted ratings for all items for each supplied user. """ - if isinstance(users,int): + if isinstance(users, int): users = [users] if users is None: U = self.U else: - U = np.asfortranarray(self.U[users,:]) + U = np.asfortranarray(self.U[users, :]) return U.dot(self.V.T) def batch_recommend_items(self, @@ -176,7 +173,8 @@ def batch_recommend_items(self, else just a list of idxs. """ r = self.predict_ratings(item_features=item_features) - return self._get_recommendations_from_predictions(r,dataset,0,r.shape[0],max_items,return_scores,show_progress) + return self._get_recommendations_from_predictions(r, dataset, 0, r.shape[0], max_items, return_scores, + show_progress) def range_recommend_items(self, dataset, @@ -210,8 +208,8 @@ def range_recommend_items(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. 
""" - r = self.predict_ratings(xrange(user_start,user_end),item_features=item_features) - return self._get_recommendations_from_predictions(r,dataset,user_start,user_end,max_items,return_scores) + r = self.predict_ratings(range(user_start, user_end), item_features=item_features) + return self._get_recommendations_from_predictions(r, dataset, user_start, user_end, max_items, return_scores) def _get_recommendations_from_predictions(self, r, @@ -247,17 +245,17 @@ def _get_recommendations_from_predictions(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - r = np.array(self._zero_known_item_scores(r,dataset[user_start:user_end,:])) - recs = [[] for u in xrange(user_start,user_end)] - for u in xrange(user_start,user_end): + r = np.array(self._zero_known_item_scores(r, dataset[user_start:user_end, :])) + recs = [[] for u in range(user_start, user_end)] + for u in range(user_start, user_end): ux = u - user_start - if show_progress and ux%1000 == 0: - print ux,'..', + if show_progress and ux % 1000 == 0: + print(ux, '..', ) ru = r[ux] if return_scores: - recs[ux] = [(i,ru[i]) for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] + recs[ux] = [(i, ru[i]) for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] else: recs[ux] = [i for i in ru.argsort()[::-1] if ru[i] > 0][:max_items] if show_progress: - print + print() return recs diff --git a/mrec/mf/warp.py b/mrec/mf/warp.py index 94b0346..dfbbfe4 100644 --- a/mrec/mf/warp.py +++ b/mrec/mf/warp.py @@ -1,10 +1,10 @@ -import numpy as np import random -from mrec.evaluation import metrics +import numpy as np + +from mrec.mf.model.warp import WARP +from mrec.mf.recommender import MatrixFactorizationRecommender -from recommender import MatrixFactorizationRecommender -from model.warp import WARP class WARPMFRecommender(MatrixFactorizationRecommender): """ @@ -27,7 +27,7 @@ class WARPMFRecommender(MatrixFactorizationRecommender): In practice it means that we optimize for ranks 1 
to max_trials-1. """ - def __init__(self,d,gamma,C,batch_size=10,positive_thresh=0.00001,max_trials=50): + def __init__(self, d, gamma, C, batch_size=10, positive_thresh=0.00001, max_trials=50): self.d = d self.gamma = gamma self.C = C @@ -35,7 +35,7 @@ def __init__(self,d,gamma,C,batch_size=10,positive_thresh=0.00001,max_trials=50) self.positive_thresh = positive_thresh self.max_trials = max_trials - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Learn factors from training set. @@ -46,15 +46,16 @@ def fit(self,train,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for each item in the dataset, ignored here. """ - max_iters,validation_iters,validation = self.create_validation_set(train) - model = WARP(self.d,self.gamma,self.C,max_iters,validation_iters,self.batch_size,self.positive_thresh,self.max_trials) + max_iters, validation_iters, validation = self.create_validation_set(train) + model = WARP(self.d, self.gamma, self.C, max_iters, validation_iters, self.batch_size, self.positive_thresh, + self.max_trials) self.description = 'WARPMF({0})'.format(model) - model.fit(train,validation) + model.fit(train, validation) self.U = model.U_ self.V = model.V_ - def create_validation_set(self,train): + def create_validation_set(self, train): """ Hide and return half of the known items for a sample of users, and estimate the number of sgd iterations to run. 
@@ -75,42 +76,43 @@
         """
         # use 1% of users for validation, with a floor
         num_users = train.shape[0]
-        num_validation_users = max(num_users/100,100)
+        num_validation_users = max(num_users // 100, 100)
         # ensure reasonable expected number of updates per validation user
-        validation_iters = 100*num_users/num_validation_users
+        validation_iters = 100 * num_users // num_validation_users
         # and reasonable number of validation cycles
-        max_iters = 30*validation_iters
+        max_iters = 30 * validation_iters
 
-        print num_validation_users,'validation users'
-        print validation_iters,'validation iters'
-        print max_iters,'max_iters'
+        print(num_validation_users, 'validation users')
+        print(validation_iters, 'validation iters')
+        print(max_iters, 'max_iters')
 
         validation = dict()
-        for u in xrange(num_validation_users):
+        for u in range(num_validation_users):
             positive = np.where(train[u].data > 0)[0]
-            hidden = random.sample(positive,positive.shape[0]/2)
+            hidden = random.sample(positive, positive.shape[0] // 2)
             if hidden:
                 train[u].data[hidden] = 0
                 validation[u] = train[u].indices[hidden]
-        return max_iters,validation_iters,validation
+        return max_iters, validation_iters, validation
+
 
 def main():
     import sys
     from mrec import load_sparse_matrix, save_recommender
-    from mrec.sparse import fast_sparse_matrix
 
     file_format = sys.argv[1]
     filepath = sys.argv[2]
     outfile = sys.argv[3]
 
     # load training set as scipy sparse matrix
-    train = load_sparse_matrix(file_format,filepath)
+    train = load_sparse_matrix(file_format, filepath)
 
-    model = WARPMFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10)
+    model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
     model.fit(train)
 
-    save_recommender(model,outfile)
+    save_recommender(model, outfile)
+
 
 if __name__ == '__main__':
     main()
diff --git a/mrec/mf/warp2.py b/mrec/mf/warp2.py
index 3e4be69..0754eab 100644
--- a/mrec/mf/warp2.py
+++ b/mrec/mf/warp2.py
@@ -1,7 +1,8 @@
 import numpy as np
 
-from warp import
WARPMFRecommender -from model.warp2 import WARP2 +from mrec.mf.model.warp2 import WARP2 +from mrec.mf.warp import WARPMFRecommender + class WARP2MFRecommender(WARPMFRecommender): """ @@ -26,9 +27,9 @@ class WARP2MFRecommender(WARPMFRecommender): """ def __str__(self): - return 'WARP2MF(d={0},gamma={1},C={2})'.format(self.d,self.gamma,self.C) + return 'WARP2MF(d={0},gamma={1},C={2})'.format(self.d, self.gamma, self.C) - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Learn factors from training set and item features. @@ -39,16 +40,17 @@ def fit(self,train,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for each item in the dataset. """ - max_iters,validation_iters,validation = self.create_validation_set(train) - model = WARP2(self.d,self.gamma,self.C,max_iters,validation_iters,self.batch_size,self.positive_thresh,self.max_trials) + max_iters, validation_iters, validation = self.create_validation_set(train) + model = WARP2(self.d, self.gamma, self.C, max_iters, validation_iters, self.batch_size, self.positive_thresh, + self.max_trials) self.description = 'WARP2MF({0})'.format(model) - model.fit(train,item_features,validation) + model.fit(train, item_features, validation) self.U = model.U_ self.V = model.V_ self.W = model.W_ - def predict_ratings(self,users=None,item_features=None): + def predict_ratings(self, users=None, item_features=None): """ Predict ratings/scores for all items for supplied users. Assumes you've already called fit() to learn the factors. @@ -69,38 +71,40 @@ def predict_ratings(self,users=None,item_features=None): predictions : numpy.ndarray, shape = [len(users), num_items] Predicted ratings for all items for each supplied user. 
""" - if isinstance(users,int): + if isinstance(users, int): users = [users] if users is None: U = self.U else: - U = np.asfortranarray(self.U[users,:]) + U = np.asfortranarray(self.U[users, :]) return U.dot(self.V.T + item_features.dot(self.W).T) -def main(file_format,filepath,feature_format,feature_file,outfile): + +def main(file_format, filepath, feature_format, feature_file, outfile): from mrec import load_sparse_matrix, save_recommender - from mrec.sparse import fast_sparse_matrix # load training set - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) # load item features, assume they are tsv: item_id,feature_id,val - X = load_sparse_matrix(feature_format,feature_file).toarray() + X = load_sparse_matrix(feature_format, feature_file).toarray() # strip features for any trailing items that don't appear in training set num_items = train.shape[1] - X = X[:num_items,:] + X = X[:num_items, :] - model = WARP2MFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10) - model.fit(train,X) + model = WARP2MFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10) + model.fit(train, X) + + save_recommender(model, outfile) - save_recommender(model,outfile) if __name__ == '__main__': import sys + file_format = sys.argv[1] filepath = sys.argv[2] feature_format = sys.argv[3] feature_file = sys.argv[4] outfile = sys.argv[5] - main(file_format,filepath,feature_format,feature_file,outfile) + main(file_format, filepath, feature_format, feature_file, outfile) diff --git a/mrec/mf/wrmf.py b/mrec/mf/wrmf.py index 725b05c..5f657d9 100644 --- a/mrec/mf/wrmf.py +++ b/mrec/mf/wrmf.py @@ -11,8 +11,9 @@ import numpy as np from scipy.sparse import csr_matrix -from mrec.sparse import fast_sparse_matrix from mrec.mf.recommender import MatrixFactorizationRecommender +from mrec.sparse import fast_sparse_matrix + class WRMFRecommender(MatrixFactorizationRecommender): """ @@ -28,21 +29,22 @@ class WRMFRecommender(MatrixFactorizationRecommender): Number 
of iterations of alternating least squares. """ - def __init__(self,d,alpha=1,lbda=0.015,num_iters=15): + def __init__(self, d, alpha=1, lbda=0.015, num_iters=15): self.d = d self.alpha = alpha self.lbda = lbda self.num_iters = num_iters def __str__(self): - return 'WRMFRecommender (d={0},alpha={1},lambda={2},num_iters={3})'.format(self.d,self.alpha,self.lbda,self.num_iters) + return 'WRMFRecommender (d={0},alpha={1},lambda={2},num_iters={3})'.format(self.d, self.alpha, self.lbda, + self.num_iters) - def init_factors(self,num_factors,assign_values=True): + def init_factors(self, num_factors, assign_values=True): if assign_values: - return self.d**-0.5*np.random.random_sample((num_factors,self.d)) - return np.empty((num_factors,self.d)) + return self.d ** -0.5 * np.random.random_sample((num_factors, self.d)) + return np.empty((num_factors, self.d)) - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Learn factors from training set. User and item factors are fitted alternately. @@ -57,42 +59,42 @@ def fit(self,train,item_features=None): if type(train) == csr_matrix: train = fast_sparse_matrix(train) - num_users,num_items = train.shape + num_users, num_items = train.shape - self.U = self.init_factors(num_users,False) # don't need values, will compute them + self.U = self.init_factors(num_users, False) # don't need values, will compute them self.V = self.init_factors(num_items) - for it in xrange(self.num_iters): - print 'iteration',it + for it in range(self.num_iters): + print('iteration', it) # fit user factors VV = self.V.T.dot(self.V) - for u in xrange(num_users): + for u in range(num_users): # get (positive i.e. 
non-zero scored) items for user indices = train.X[u].nonzero()[1] if indices.size: - self.U[u,:] = self.update(indices,self.V,VV) + self.U[u, :] = self.update(indices, self.V, VV) else: - self.U[u,:] = np.zeros(self.d) + self.U[u, :] = np.zeros(self.d) # fit item factors UU = self.U.T.dot(self.U) - for i in xrange(num_items): + for i in range(num_items): indices = train.fast_get_col(i).nonzero()[0] if indices.size: - self.V[i,:] = self.update(indices,self.U,UU) + self.V[i, :] = self.update(indices, self.U, UU) else: - self.V[i,:] = np.zeros(self.d) + self.V[i, :] = np.zeros(self.d) - def update(self,indices,H,HH): + def update(self, indices, H, HH): """ Update latent factors for a single user or item. """ - Hix = H[indices,:] - M = HH + self.alpha*Hix.T.dot(Hix) + np.diag(self.lbda*np.ones(self.d)) - return np.dot(np.linalg.inv(M),(1+self.alpha)*Hix.sum(axis=0)) + Hix = H[indices, :] + M = HH + self.alpha * Hix.T.dot(Hix) + np.diag(self.lbda * np.ones(self.d)) + return np.dot(np.linalg.inv(M), (1 + self.alpha) * Hix.sum(axis=0)) + def main(): import sys from mrec import load_sparse_matrix, save_recommender - from mrec.sparse import fast_sparse_matrix from mrec.mf.wrmf import WRMFRecommender file_format = sys.argv[1] @@ -100,12 +102,13 @@ def main(): outfile = sys.argv[3] # load training set as scipy sparse matrix - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) model = WRMFRecommender(d=5) model.fit(train) - save_recommender(model,outfile) + save_recommender(model, outfile) + if __name__ == '__main__': main() diff --git a/mrec/parallel/evaluate.py b/mrec/parallel/evaluate.py index 7e0461d..8094fcd 100644 --- a/mrec/parallel/evaluate.py +++ b/mrec/parallel/evaluate.py @@ -2,18 +2,15 @@ Evaluation task to run on an ipython engine. 
""" -def run(task): +def run(task): # import modules required by engine - import numpy as np - from scipy.sparse import coo_matrix - from collections import defaultdict from mrec import load_sparse_matrix - input_format,testfile,recsfile,start,end,evaluator = task + input_format, testfile, recsfile, start, end, evaluator = task # load the test data - testdata = load_sparse_matrix(input_format,testfile) + testdata = load_sparse_matrix(input_format, testfile) - return evaluator.process(testdata,recsfile,start,end) + return evaluator.process(testdata, recsfile, start, end) diff --git a/mrec/parallel/item_similarity.py b/mrec/parallel/item_similarity.py index 239912e..4c1f452 100644 --- a/mrec/parallel/item_similarity.py +++ b/mrec/parallel/item_similarity.py @@ -1,25 +1,25 @@ -import math import glob -import re +import logging +import math import os +import re import subprocess from shutil import rmtree -import logging from mrec import load_sparse_matrix, save_recommender -class ItemSimilarityRunner(object): - def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max_sims,simsfile,modelfile): +class ItemSimilarityRunner(object): + def run(self, view, model, input_format, trainfile, num_engines, simsdir, overwrite, max_sims, simsfile, modelfile): logging.info('finding number of items...') - dataset = load_sparse_matrix(input_format,trainfile) - num_users,num_items = dataset.shape + dataset = load_sparse_matrix(input_format, trainfile) + num_users, num_items = dataset.shape del dataset logging.info('%d users and %d items', num_users, num_items) logging.info('creating sims directory {0}...'.format(simsdir)) - subprocess.check_call(['mkdir','-p',simsdir]) + subprocess.check_call(['mkdir', '-p', simsdir]) done = [] if not overwrite: @@ -29,12 +29,12 @@ def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max logging.info('found {0} output files'.format(len(done))) logging.info('creating tasks...') - tasks = 
self.create_tasks(model,input_format,trainfile,simsdir,num_items,num_engines,max_sims,done) + tasks = self.create_tasks(model, input_format, trainfile, simsdir, num_items, num_engines, max_sims, done) - if num_engines > 0: + if num_engines > 0 and len(tasks) > 0: logging.info('running %d tasks in parallel across ipython' ' engines...', len(tasks)) - async_job = view.map_async(process,tasks,retries=2) + async_job = view.map_async(process, tasks, retries=2) # wait for tasks to complete results = async_job.get() else: @@ -48,43 +48,44 @@ def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max if remaining == 0: logging.info('SUCCESS: all tasks completed') logging.info('concatenating {0} partial output files...'.format(len(done))) - paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done] - cmd = ['cat']+paths - subprocess.check_call(cmd,stdout=open(simsfile,'w')) + paths = [os.path.join(simsdir, 'sims.{0}-{1}.tsv'.format(start, end)) for start, end in done] + cmd = ['cat'] + paths + subprocess.check_call(cmd, stdout=open(simsfile, 'w')) logging.info('removing partial output files...') rmtree(simsdir) logging.info('loading %d items in %s model from %s', num_items, type(model).__name__, simsfile) - model.load_similarity_matrix(simsfile,num_items) - save_recommender(model,modelfile) + model.load_similarity_matrix(simsfile, num_items) + save_recommender(model, modelfile) logging.info('done') else: - logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) + logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks))) logging.error('try rerunning the command to retry the remaining tasks') - def find_done(self,outdir): - success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) + def find_done(self, outdir): + success_files = glob.glob(os.path.join(outdir, '*.SUCCESS')) r = re.compile('.*?([0-9]+)-([0-9]+)\.SUCCESS$') done = [] for path 
in success_files: m = r.match(path) start = int(m.group(1)) end = int(m.group(2)) - done.append((start,end)) + done.append((start, end)) return done - def create_tasks(self,model,input_format,trainfile,outdir,num_items,num_engines,max_similar_items,done): + def create_tasks(self, model, input_format, trainfile, outdir, num_items, num_engines, max_similar_items, done): if num_engines == 0: # special marker for sequential run num_engines = 1 - items_per_engine = int(math.ceil(float(num_items)/num_engines)) + items_per_engine = int(math.ceil(float(num_items) / num_engines)) tasks = [] - for start in xrange(0,num_items,items_per_engine): - end = min(num_items,start+items_per_engine) - if (start,end) not in done: - tasks.append((model,input_format,trainfile,outdir,start,end,max_similar_items)) + for start in range(0, num_items, items_per_engine): + end = min(num_items, start + items_per_engine) + if (start, end) not in done: + tasks.append((model, input_format, trainfile, outdir, start, end, max_similar_items)) return tasks + def process(task): """ Training task to run on an ipython engine. @@ -95,27 +96,27 @@ def process(task): import subprocess from mrec import load_fast_sparse_matrix - model,input_format,trainfile,outdir,start,end,max_similar_items = task + model, input_format, trainfile, outdir, start, end, max_similar_items = task # initialise the model - dataset = load_fast_sparse_matrix(input_format,trainfile) - if hasattr(model,'similarity_matrix'): + dataset = load_fast_sparse_matrix(input_format, trainfile) + if hasattr(model, 'similarity_matrix'): # clear out any existing similarity matrix to trigger recomputation of # the item-item similarities from the users' ratings. 
model.similarity_matrix = None # write sims directly to file as we compute them - outfile = os.path.join(outdir,'sims.{0}-{1}.tsv'.format(start,end)) - out = open(outfile,'w') - for j in xrange(start,end): - w = model.get_similar_items(j,max_similar_items=max_similar_items,dataset=dataset) - for k,v in w: - print >>out,'{0}\t{1}\t{2}'.format(j+1,k+1,v) # write as 1-indexed + outfile = os.path.join(outdir, 'sims.{0}-{1}.tsv'.format(start, end)) + out = open(outfile, 'w') + for j in range(start, end): + w = model.get_similar_items(j, max_similar_items=max_similar_items, dataset=dataset) + for k, v in w: + print('{0}\t{1}\t{2}'.format(j + 1, k + 1, v), file=out) # write as 1-indexed out.close() # record success - cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))] + cmd = ['touch', os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))] subprocess.check_call(cmd) # return the range that we've processed - return start,end + return start, end diff --git a/mrec/parallel/predict.py b/mrec/parallel/predict.py index e9d5b40..3d16a5c 100644 --- a/mrec/parallel/predict.py +++ b/mrec/parallel/predict.py @@ -2,48 +2,46 @@ Prediction task to run on an ipython engine. 
""" -def run(task): +def run(task): # import modules required by engine import os import subprocess - import numpy as np - from scipy.sparse import coo_matrix from mrec import load_sparse_matrix, load_recommender - from mrec.evaluation import Evaluator - modelfile,input_format,trainfile,test_input_format,testfile,feature_format,featurefile,outdir,start,end,evaluator,generate = task + modelfile, input_format, trainfile, test_input_format, testfile, feature_format, featurefile, outdir, start, end, evaluator, generate = task # initialise the model model = load_recommender(modelfile) - outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end)) + outfile = os.path.join(outdir, 'recs.{0}-{1}.tsv'.format(start, end)) if generate: # generate recommendations for our batch of users - dataset = load_sparse_matrix(input_format,trainfile) - out = open(outfile,'w') + dataset = load_sparse_matrix(input_format, trainfile) + out = open(outfile, 'w') if featurefile is not None: # currently runs much faster if features are loaded as a dense matrix - item_features = load_sparse_matrix(feature_format,featurefile).toarray() + item_features = load_sparse_matrix(feature_format, featurefile).toarray() # strip features for any trailing items that don't appear in training set num_items = dataset.shape[1] - item_features = item_features[:num_items,:] - recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features) + item_features = item_features[:num_items, :] + recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True, + item_features=item_features) else: - recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True) - for u,items in zip(xrange(start,end),recs): - for i,w in items: - print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w) # write as 1-indexed + recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True) + for u, items in zip(range(start, 
end), recs): + for i, w in items: + print('{0}\t{1}\t{2}'.format(u + 1, i + 1, w), file=out) # write as 1-indexed out.close() # record success - cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))] + cmd = ['touch', os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))] subprocess.check_call(cmd) # load the test data - testdata = load_sparse_matrix(test_input_format,testfile).tocsr() + testdata = load_sparse_matrix(test_input_format, testfile).tocsr() # return evaluation metrics - return evaluator.process(testdata,outfile,start,end) + return evaluator.process(testdata, outfile, start, end) diff --git a/mrec/parallel/warp.py b/mrec/parallel/warp.py index 840ff56..fba5b2e 100644 --- a/mrec/parallel/warp.py +++ b/mrec/parallel/warp.py @@ -1,15 +1,16 @@ import glob -import re +import logging import os +import re import subprocess from shutil import rmtree -import logging + import numpy as np from mrec import save_recommender, load_recommender -class WARPMFRunner(object): +class WARPMFRunner(object): def run(self, view, model, @@ -23,7 +24,7 @@ def run(self, modelfile): logging.info('creating models directory {0}...'.format(workdir)) - subprocess.check_call(['mkdir','-p',workdir]) + subprocess.check_call(['mkdir', '-p', workdir]) done = [] if not overwrite: @@ -44,7 +45,7 @@ def run(self, if tasks: logging.info('running in parallel across ipython engines...') - async_job = view.map_async(process,tasks,retries=2) + async_job = view.map_async(process, tasks, retries=2) # wait for tasks to complete results = async_job.get() @@ -59,22 +60,22 @@ def run(self, logging.info('SUCCESS: all tasks completed') logging.info('concatenating {0} models...'.format(len(done))) for ix in sorted(done): - partial_model = load_recommender(self.get_modelfile(ix,workdir)) + partial_model = load_recommender(self.get_modelfile(ix, workdir)) if ix == 0: model = partial_model else: # concatenate factors model.d += partial_model.d - model.U = 
np.hstack((model.U,partial_model.U)) - model.V = np.hstack((model.V,partial_model.V)) - if hasattr(model,'W'): - model.W = np.hstack((model.W,partial_model.W)) - save_recommender(model,modelfile) + model.U = np.hstack((model.U, partial_model.U)) + model.V = np.hstack((model.V, partial_model.V)) + if hasattr(model, 'W'): + model.W = np.hstack((model.W, partial_model.W)) + save_recommender(model, modelfile) logging.info('removing partial output files...') rmtree(workdir) logging.info('done') else: - logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks))) + logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining, len(tasks))) logging.error('try rerunning the command to retry the remaining tasks') def create_tasks(self, @@ -87,14 +88,14 @@ def create_tasks(self, num_engines, done): tasks = [] - for ix in xrange(num_engines): + for ix in range(num_engines): if ix not in done: - outfile = self.get_modelfile(ix,outdir) - tasks.append((model,input_format,trainfile,feature_format,featurefile,outfile,ix,num_engines)) + outfile = self.get_modelfile(ix, outdir) + tasks.append((model, input_format, trainfile, feature_format, featurefile, outfile, ix, num_engines)) return tasks - def find_done(self,outdir): - success_files = glob.glob(os.path.join(outdir,'*.SUCCESS')) + def find_done(self, outdir): + success_files = glob.glob(os.path.join(outdir, '*.SUCCESS')) r = re.compile('.*?([0-9]+)\.model\.npz\.SUCCESS$') done = [] for path in success_files: @@ -103,8 +104,9 @@ def find_done(self,outdir): done.append(ix) return done - def get_modelfile(self,ix,workdir): - return os.path.join(workdir,'{0}.model.npz'.format(ix)) + def get_modelfile(self, ix, workdir): + return os.path.join(workdir, '{0}.model.npz'.format(ix)) + def process(task): """ @@ -112,26 +114,25 @@ def process(task): """ # import modules required by engine - import os import subprocess from mrec import load_sparse_matrix, save_recommender - 
model,input_format,trainfile,feature_format,featurefile,outfile,offset,step = task + model, input_format, trainfile, feature_format, featurefile, outfile, offset, step = task - dataset = load_sparse_matrix(input_format,trainfile) + dataset = load_sparse_matrix(input_format, trainfile) if featurefile is not None: # currently runs much faster if features are loaded as a dense matrix - item_features = load_sparse_matrix(feature_format,featurefile).toarray() + item_features = load_sparse_matrix(feature_format, featurefile).toarray() # strip features for any trailing items that don't appear in training set num_items = dataset.shape[1] - item_features = item_features[:num_items,:] - model.fit(dataset,item_features=item_features) + item_features = item_features[:num_items, :] + model.fit(dataset, item_features=item_features) else: model.fit(dataset) - save_recommender(model,outfile) + save_recommender(model, outfile) # record success - cmd = ['touch','{0}.SUCCESS'.format(outfile)] + cmd = ['touch', '{0}.SUCCESS'.format(outfile)] subprocess.check_call(cmd) # return the offset for the samples that we've learned from diff --git a/mrec/parallel/wrmf.py b/mrec/parallel/wrmf.py index e2d0fc5..dd70118 100644 --- a/mrec/parallel/wrmf.py +++ b/mrec/parallel/wrmf.py @@ -1,77 +1,90 @@ import glob import logging +import math import os import subprocess from shutil import rmtree -import math + import numpy as np from mrec import load_sparse_matrix, save_recommender -def get_user_indices(data,u): + +def get_user_indices(data, u): # get (positive i.e. 
non-zero scored) items for user return data.X[u].nonzero()[1] -def get_item_indices(data,i): + +def get_item_indices(data, i): # get users for item return data.fast_get_col(i).nonzero()[0] -def get_factor_files(workdir,factor_type): + +def get_factor_files(workdir, factor_type): # return partial factor files in sorted order so they can simply be stacked - factor_files = glob.glob(os.path.join(workdir,'{0}.*.npy'.format(factor_type))) - return sorted(factor_files,key=lambda x: int(x[:-4][x[:-4].rfind('.')+1:])) + factor_files = glob.glob(os.path.join(workdir, '{0}.*.npy'.format(factor_type))) + return sorted(factor_files, key=lambda x: int(x[:-4][x[:-4].rfind('.') + 1:])) + def get_user_factor_files(workdir): - return get_factor_files(workdir,'U') + return get_factor_files(workdir, 'U') + def get_item_factor_files(workdir): - return get_factor_files(workdir,'V') + return get_factor_files(workdir, 'V') + -def init_item_factors(model,data): - num_users,num_items = data.shape +def init_item_factors(model, data): + num_users, num_items = data.shape return model.init_factors(num_items) -class WRMFRunner(object): - def run(self,view,model,input_format,trainfile,num_engines,workdir,modelfile): +class WRMFRunner(object): + def run(self, view, model, input_format, trainfile, num_engines, workdir, modelfile): logging.info('creating factors directory {0}'.format(workdir)) - subprocess.check_call(['mkdir','-p',workdir]) + subprocess.check_call(['mkdir', '-p', workdir]) logging.info('getting data size') - data = load_sparse_matrix(input_format,trainfile) - num_users,num_items = data.shape + data = load_sparse_matrix(input_format, trainfile) + num_users, num_items = data.shape del data - for it in xrange(model.num_iters): + for it in range(model.num_iters): logging.info('iteration {0}'.format(it)) - tasks = self.create_tasks(num_users,num_engines,model,input_format,trainfile,workdir,'U',get_user_indices,get_item_factor_files,init_item_factors) - self.run_tasks(view,tasks) - tasks 
= self.create_tasks(num_items,num_engines,model,input_format,trainfile,workdir,'V',get_item_indices,get_user_factor_files,None) # won't need to initialize user factors - self.run_tasks(view,tasks) + tasks = self.create_tasks(num_users, num_engines, model, input_format, trainfile, workdir, 'U', + get_user_indices, get_item_factor_files, init_item_factors) + self.run_tasks(view, tasks) + tasks = self.create_tasks(num_items, num_engines, model, input_format, trainfile, workdir, 'V', + get_item_indices, get_user_factor_files, + None) # won't need to initialize user factors + self.run_tasks(view, tasks) model.U = np.vstack([np.load(f) for f in get_user_factor_files(workdir)]) model.V = np.vstack([np.load(f) for f in get_item_factor_files(workdir)]) - save_recommender(model,modelfile) + save_recommender(model, modelfile) logging.info('removing partial output files') rmtree(workdir) logging.info('done') - def run_tasks(self,view,tasks): - async_job = view.map_async(compute_factors,tasks,retries=2) + def run_tasks(self, view, tasks): + async_job = view.map_async(compute_factors, tasks, retries=2) # wait for tasks to complete result = async_job.get() - def create_tasks(self,num_factors,num_engines,model,input_format,trainfile,workdir,factor_type,get_indices,get_fixed_factor_files,init_fixed_factors): - factors_per_engine = int(math.ceil(float(num_factors)/num_engines)) + def create_tasks(self, num_factors, num_engines, model, input_format, trainfile, workdir, factor_type, get_indices, + get_fixed_factor_files, init_fixed_factors): + factors_per_engine = int(math.ceil(float(num_factors) / num_engines)) tasks = [] - for start in xrange(0,num_factors,factors_per_engine): - end = min(num_factors,start+factors_per_engine) + for start in range(0, num_factors, factors_per_engine): + end = min(num_factors, start + factors_per_engine) fixed_factor_files = get_fixed_factor_files(workdir) - 
tasks.append((model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir)) + tasks.append((model, input_format, trainfile, factor_type, get_indices, init_fixed_factors, + fixed_factor_files, start, end, workdir)) return tasks + def compute_factors(task): """ WRMF update method to run on an IPython engine. @@ -84,22 +97,22 @@ def compute_factors(task): import numpy as np from mrec import load_fast_sparse_matrix - model,input_format,trainfile,factor_type,get_indices,init_fixed_factors,fixed_factor_files,start,end,workdir = task + model, input_format, trainfile, factor_type, get_indices, init_fixed_factors, fixed_factor_files, start, end, workdir = task - data = load_fast_sparse_matrix(input_format,trainfile) + data = load_fast_sparse_matrix(input_format, trainfile) if fixed_factor_files: H = np.vstack([np.load(f) for f in fixed_factor_files]) else: - H = init_fixed_factors(model,data) + H = init_fixed_factors(model, data) HH = H.T.dot(H) - W = np.zeros(((end-start),model.d)) - for j in xrange(start,end): - indices = get_indices(data,j) + W = np.zeros(((end - start), model.d)) + for j in range(start, end): + indices = get_indices(data, j) if indices.size: - W[j-start,:] = model.update(indices,H,HH) + W[j - start, :] = model.update(indices, H, HH) - np.save(os.path.join(workdir,'{0}.{1}.npy'.format(factor_type,start)),W) + np.save(os.path.join(workdir, '{0}.{1}.npy'.format(factor_type, start)), W) - return start,end + return start, end diff --git a/mrec/popularity.py b/mrec/popularity.py index 9c04ee2..fb0a5fc 100644 --- a/mrec/popularity.py +++ b/mrec/popularity.py @@ -3,10 +3,9 @@ intended to provide a baseline for evaluations. 
""" -import numpy as np +from mrec.base_recommender import BaseRecommender +from mrec.sparse import fast_sparse_matrix -from base_recommender import BaseRecommender -from sparse import fast_sparse_matrix class ItemPopularityRecommender(BaseRecommender): """ @@ -28,14 +27,14 @@ class ItemPopularityRecommender(BaseRecommender): popularity. """ - def __init__(self,method='count',thresh=0): + def __init__(self, method='count', thresh=0): self.description = 'ItemPop' - if method not in ['count','sum','avg','thresh']: + if method not in ['count', 'sum', 'avg', 'thresh']: raise ValueError('invalid value for method parameter') self.method = method self.thresh = thresh - def fit(self,dataset,item_features=None): + def fit(self, dataset, item_features=None): """ Compute the most popular items using the method specified in the constructor. @@ -47,26 +46,26 @@ def fit(self,dataset,item_features=None): item_features : array_like, shape = [num_items, num_features] Features for items in training set, ignored here. 
""" - if isinstance(dataset,fast_sparse_matrix): + if isinstance(dataset, fast_sparse_matrix): d = dataset.X.tocsc() else: d = dataset.tocsc() if self.method == 'count': # count the total number of ratings for each item - popularity = [(d[:,i].nnz,i) for i in xrange(d.shape[1])] + popularity = [(d[:, i].nnz, i) for i in range(d.shape[1])] elif self.method == 'sum': # find the sum of the ratings for each item - popularity = [(d[:,i].sum(),i) for i in xrange(d.shape[1])] + popularity = [(d[:, i].sum(), i) for i in range(d.shape[1])] elif self.method == 'avg': # find the mean rating for each item - popularity = [(d[:,i].mean(),i) for i in xrange(d.shape[1])] + popularity = [(d[:, i].mean(), i) for i in range(d.shape[1])] elif self.method == 'thresh': # count the number of ratings above thresh for each item - popularity = [(sum(d[:,i].data>self.thresh),i) for i in xrange(d.shape[1])] + popularity = [(sum(d[:, i].data > self.thresh), i) for i in range(d.shape[1])] popularity.sort(reverse=True) - self.pop_items = [(i,c) for (c,i) in popularity] + self.pop_items = [(i, c) for (c, i) in popularity] - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. Assumes you've already called fit(). @@ -93,10 +92,10 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features """ known_items = set(dataset[u].indices) recs = [] - for i,c in self.pop_items: + for i, c in self.pop_items: if i not in known_items: if return_scores: - recs.append((i,c)) + recs.append((i, c)) else: recs.append(i) if len(recs) >= max_items: diff --git a/mrec/reranking_recommender.py b/mrec/reranking_recommender.py index ade5912..ed262ff 100644 --- a/mrec/reranking_recommender.py +++ b/mrec/reranking_recommender.py @@ -3,13 +3,12 @@ and then reranks them using a matrix factorization model. 
""" -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle + import numpy as np -from base_recommender import BaseRecommender +from mrec.base_recommender import BaseRecommender + class RerankingRecommender(BaseRecommender): """ @@ -28,31 +27,31 @@ class RerankingRecommender(BaseRecommender): The number of candidate items drawn from the first model for each user. """ - def __init__(self,item_similarity_recommender,mf_recommender,num_candidates=100): + def __init__(self, item_similarity_recommender, mf_recommender, num_candidates=100): self.item_similarity_recommender = item_similarity_recommender self.mf_recommender = mf_recommender self.num_candidates = num_candidates - self.description = 'RerankingRecommender({0},{1})'.format(self.item_similarity_recommender,self.mf_recommender) + self.description = 'RerankingRecommender({0},{1})'.format(self.item_similarity_recommender, self.mf_recommender) def _create_archive(self): archive = self.item_similarity_recommender._create_archive() archive['item_similarity_model'] = archive['model'] archive.update(self.mf_recommender._create_archive()) archive['mf_model'] = archive['model'] - tmp = self.item_similarity_recommender,self.mf_recommender + tmp = self.item_similarity_recommender, self.mf_recommender self.item_similarity_model = self.mf_recommender = None m = pickle.dumps(self) - self.item_similarity_model,self.mf_recommender = tmp + self.item_similarity_model, self.mf_recommender = tmp archive['model'] = m return archive - def _load_archive(self,archive): + def _load_archive(self, archive): self.item_similarity_recommender = np.loads(str(archive['item_similarity_model'])) self.item_similarity_recommender._load_archive(archive) self.mf_recommender = np.loads(str(archive['mf_model'])) self.mf_recommender._load_archive(archive) - def fit(self,train,item_features=None): + def fit(self, train, item_features=None): """ Fit both models to the training data. 
@@ -68,10 +67,10 @@ def fit(self,train,item_features=None): You are not obliged to call this, alternatively you can pass ready trained models to the RerankingRecommender constructor. """ - self.item_similarity_recommender.fit(train,item_features) - self.mf_recommender.fit(train,item_features) + self.item_similarity_recommender.fit(train, item_features) + self.mf_recommender.fit(train, item_features) - def rerank(self,u,candidates,max_items,return_scores): + def rerank(self, u, candidates, max_items, return_scores): """ Use latent factors to rerank candidate recommended items for a user and return the highest scoring. @@ -94,14 +93,14 @@ def rerank(self,u,candidates,max_items,return_scores): just a list of idxs. """ r = self.mf_recommender.U[u].dot(self.mf_recommender.V[candidates].T) - reranked = r.argsort()[:-1-max_items:-1] + reranked = r.argsort()[:-1 - max_items:-1] if return_scores: - recs = [(candidates[i],r[i]) for i in reranked] + recs = [(candidates[i], r[i]) for i in reranked] else: recs = [candidates[i] for i in reranked] return recs - def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features=None): + def recommend_items(self, dataset, u, max_items=10, return_scores=True, item_features=None): """ Recommend new items for a user. @@ -124,8 +123,9 @@ def recommend_items(self,dataset,u,max_items=10,return_scores=True,item_features List of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - candidates = self.item_similarity_recommender.recommend_items(dataset,u,self.num_candidates,return_scores=False) - return self.rerank(u,candidates,max_items,return_scores=return_scores) + candidates = self.item_similarity_recommender.recommend_items(dataset, u, self.num_candidates, + return_scores=False) + return self.rerank(u, candidates, max_items, return_scores=return_scores) def batch_recommend_items(self, dataset, @@ -144,8 +144,6 @@ def batch_recommend_items(self, Maximum number of recommended items to return. 
return_scores : bool If true return a score along with each recommended item. - show_progress: bool - If true print something to stdout to show progress. item_features : array_like, shape = [num_items, num_features] Features for items in training set, required by some recommenders. @@ -155,9 +153,10 @@ def batch_recommend_items(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. """ - recs = self.item_similarity_recommender.batch_recommend_items(dataset,self.num_candidates,return_scores=False,item_features=item_features) - for u,candidates in enumerate(recs): - recs[u] = self.rerank(u,candidates,max_items,return_scores=return_scores) + recs = self.item_similarity_recommender.batch_recommend_items(dataset, self.num_candidates, return_scores=False, + item_features=item_features) + for u, candidates in enumerate(recs): + recs[u] = self.rerank(u, candidates, max_items, return_scores=return_scores) return recs def range_recommend_items(self, @@ -192,15 +191,17 @@ def range_recommend_items(self, Each entry is a list of (idx,score) pairs if return_scores is True, else just a list of idxs. 
""" - recs = self.item_similarity_recommender.range_recommend_items(dataset,user_start,user_end,self.num_candidates,return_scores=False,item_features=item_features) - for u,candidates in enumerate(recs): - recs[u] = self.rerank(user_start+u,candidates,max_items,return_scores=return_scores) + recs = self.item_similarity_recommender.range_recommend_items(dataset, user_start, user_end, + self.num_candidates, return_scores=False, + item_features=item_features) + for u, candidates in enumerate(recs): + recs[u] = self.rerank(user_start + u, candidates, max_items, return_scores=return_scores) return recs + def main(): import sys from mrec import load_sparse_matrix, save_recommender - from mrec.sparse import fast_sparse_matrix from mrec.item_similarity.knn import CosineKNNRecommender from mrec.mf.warp import WARPMFRecommender from mrec.reranking_recommender import RerankingRecommender @@ -210,16 +211,16 @@ def main(): outfile = sys.argv[3] # load training set as scipy sparse matrix - train = load_sparse_matrix(file_format,filepath) + train = load_sparse_matrix(file_format, filepath) item_sim_model = CosineKNNRecommender(k=100) - mf_model = WARPMFRecommender(d=80,gamma=0.01,C=100.0,max_iters=25000,validation_iters=1000,batch_size=10) - recommender = RerankingRecommender(item_sim_model,mf_model,num_candidates=100) + mf_model = WARPMFRecommender(d=80, gamma=0.01, C=100.0, max_iters=25000, validation_iters=1000, batch_size=10) + recommender = RerankingRecommender(item_sim_model, mf_model, num_candidates=100) recommender.fit(train) - save_recommender(recommender,outfile) + save_recommender(recommender, outfile) + if __name__ == '__main__': main() - diff --git a/mrec/sparse.py b/mrec/sparse.py index b08541e..f7884ae 100644 --- a/mrec/sparse.py +++ b/mrec/sparse.py @@ -3,11 +3,13 @@ """ import random + import numpy as np -from scipy.sparse import csr_matrix, coo_matrix from scipy.io import mmread +from scipy.sparse import csr_matrix, coo_matrix -def 
loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_offset=1): + +def loadtxt(filepath, comments='#', delimiter=None, skiprows=0, usecols=None, index_offset=1): """ Load a scipy sparse matrix from simply formatted data such as TSV, handles similar input to numpy.loadtxt(). @@ -36,16 +38,17 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o mat : scipy.sparse.csr_matrix The sparse matrix. """ - d = np.loadtxt(filepath,comments=comments,delimiter=delimiter,skiprows=skiprows,usecols=usecols) + d = np.loadtxt(filepath, comments=comments, delimiter=delimiter, skiprows=skiprows, usecols=usecols) if d.shape[1] < 3: raise ValueError('invalid number of columns in input') - row = d[:,0]-index_offset - col = d[:,1]-index_offset - data = d[:,2] - shape = (max(row)+1,max(col)+1) - return csr_matrix((data,(row,col)),shape=shape) + row = d[:, 0] - index_offset + col = d[:, 1] - index_offset + data = d[:, 2] + shape = (max(row) + 1, max(col) + 1) + return csr_matrix((data, (row, col)), shape=shape) + -def savez(d,file): +def savez(d, file): """ Save a sparse matrix to file in numpy binary format. @@ -58,7 +61,8 @@ def savez(d,file): where the matrix will be saved. If file is a string, the ``.npz`` extension will be appended to the file name if it is not already there. """ - np.savez(file,row=d.row,col=d.col,data=d.data,shape=d.shape) + np.savez(file, row=d.row, col=d.col, data=d.data, shape=d.shape) + def loadz(file): """ @@ -75,7 +79,8 @@ def loadz(file): The sparse matrix. 
""" y = np.load(file) - return coo_matrix((y['data'],(y['row'],y['col'])),shape=y['shape']) + return coo_matrix((y['data'], (y['row'], y['col'])), shape=y['shape']) + class fast_sparse_matrix(object): """ @@ -95,7 +100,8 @@ class fast_sparse_matrix(object): >>> col = fsm.fast_get_col(2) # get a column quickly >>> row = fsm.X[1] # get a row as usual """ - def __init__(self,X,col_view=None): + + def __init__(self, X, col_view=None): """ Create a fast_sparse_matrix from a csr_matrix X. Note that X is not copied and its values will be modified by @@ -126,7 +132,7 @@ def shape(self): """ return self.X.shape - def fast_get_col(self,j): + def fast_get_col(self, j): """ Return column j of the underlying matrix. @@ -140,11 +146,11 @@ def fast_get_col(self,j): col : scipy.sparse.csc_matrix Copy of column j of the matrix. """ - col = self.col_view[:,j].copy() + col = self.col_view[:, j].copy() col.data = self.X.data[col.data] return col - def fast_update_col(self,j,vals): + def fast_update_col(self, j, vals): """ Update values of existing non-zeros in column of the underlying matrix. @@ -159,10 +165,10 @@ def fast_update_col(self,j,vals): only change the value of existing non-zero entries of column j, it cannot add new ones. """ - dataptr = self.col_view[:,j].data + dataptr = self.col_view[:, j].data self.X.data[dataptr] = vals - def ensure_sparse_cols(self,max_density,remove_lowest=True): + def ensure_sparse_cols(self, max_density, remove_lowest=True): """ Ensure that no column of the matrix excess the specified density, setting excess entries to zero where necessary. 
@@ -191,19 +197,19 @@ def ensure_sparse_cols(self,max_density,remove_lowest=True): if max_density >= 1: max_nnz = int(max_density) else: - max_nnz = int(max_density*self.shape[0]) - for j in xrange(self.shape[1]): + max_nnz = int(max_density * self.shape[0]) + for j in range(self.shape[1]): col = self.fast_get_col(j) excess = col.nnz - max_nnz if excess > 0: if remove_lowest: zero_entries = np.argsort(col.data)[:excess] else: - zero_entries = random.sample(xrange(col.nnz),excess) + zero_entries = random.sample(range(col.nnz), excess) col.data[zero_entries] = 0 - self.fast_update_col(j,col.data) + self.fast_update_col(j, col.data) - def save(self,filepath): + def save(self, filepath): """ Save to file as arrays in numpy binary format. @@ -214,8 +220,8 @@ def save(self,filepath): """ d = self.X.tocoo(copy=False) v = self.col_view.tocoo(copy=False) - np.savez(filepath,row=d.row,col=d.col,data=d.data,shape=d.shape, - v_row=v.row,v_col=v.col,v_data=v.data,v_shape=v.shape) + np.savez(filepath, row=d.row, col=d.col, data=d.data, shape=d.shape, + v_row=v.row, v_col=v.col, v_data=v.data, v_shape=v.shape) @staticmethod def load(filepath): @@ -227,13 +233,13 @@ def load(filepath): filepath : str The filepath to load. 
""" - y = np.load(filepath,mmap_mode='r') - X = coo_matrix((y['data'],(y['row'],y['col'])),shape=y['shape']) - col_view = coo_matrix((y['v_data'],(y['v_row'],y['v_col'])),shape=y['v_shape']) - return fast_sparse_matrix(X,col_view.tocsc()) + y = np.load(filepath, mmap_mode='r') + X = coo_matrix((y['data'], (y['row'], y['col'])), shape=y['shape']) + col_view = coo_matrix((y['v_data'], (y['v_row'], y['v_col'])), shape=y['v_shape']) + return fast_sparse_matrix(X, col_view.tocsc()) @staticmethod - def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_offset=1): + def loadtxt(filepath, comments='#', delimiter=None, skiprows=0, usecols=None, index_offset=1): """ Create a fast_sparse_matrix from simply formatted data such as TSV, handles similar input to numpy.loadtxt(). @@ -262,7 +268,7 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o mat : mrec.sparse.fast_sparse_matrix A fast_sparse_matrix holding the data in the file. """ - X = loadtxt(filepath,comments=comments,delimiter=delimiter,skiprows=skiprows,usecols=usecols) + X = loadtxt(filepath, comments=comments, delimiter=delimiter, skiprows=skiprows, usecols=usecols) return fast_sparse_matrix(X) @staticmethod @@ -282,4 +288,3 @@ def loadmm(filepath): """ X = mmread(filepath) return fast_sparse_matrix(X) - diff --git a/mrec/testing.py b/mrec/testing.py index 75c5945..41885b1 100644 --- a/mrec/testing.py +++ b/mrec/testing.py @@ -1,21 +1,23 @@ import random + import numpy as np from scipy.sparse import coo_matrix from sklearn.utils.testing import assert_array_equal -def get_random_coo_matrix(rows=3,cols=10,nnz=20): - row_col = random.sample(xrange(rows*cols),nnz) # ensure are unique + +def get_random_coo_matrix(rows=3, cols=10, nnz=20): + row_col = random.sample(range(rows * cols), nnz) # ensure are unique row = [i // cols for i in row_col] col = [i % cols for i in row_col] - data = np.random.randint(0,nnz*5,nnz) - return 
coo_matrix((data,(row,col)),shape=(rows,cols)) + data = np.random.randint(0, nnz * 5, nnz) + return coo_matrix((data, (row, col)), shape=(rows, cols)) -def assert_sparse_matrix_equal(X,Y): + +def assert_sparse_matrix_equal(X, Y): expected = X.toarray() actual = Y.toarray() # it's possible that we had trailing empty columns in X # - there's no way we can know about these sometimes e.g. # when reading back from file - expected = expected[:actual.shape[0],:actual.shape[1]] - assert_array_equal(expected,actual) - + expected = expected[:actual.shape[0], :actual.shape[1]] + assert_array_equal(expected, actual) diff --git a/mrec/tests/test_base_recommender.py b/mrec/tests/test_base_recommender.py index a75dea9..f61e408 100644 --- a/mrec/tests/test_base_recommender.py +++ b/mrec/tests/test_base_recommender.py @@ -1,70 +1,75 @@ -try: - import cPickle as pickle -except ImportError: - import pickle +import pickle import tempfile -import os + import numpy as np from nose.tools import assert_less_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises +from mrec.base_recommender import BaseRecommender from mrec.testing import get_random_coo_matrix -from mrec.base_recommender import BaseRecommender class MyRecommender(BaseRecommender): def __init__(self): self.foo = np.ndarray(range(10)) self.description = 'my recommender' + def _create_archive(self): tmp = self.foo self.foo = None m = pickle.dumps(self) self.foo = tmp - return {'model':m,'foo':self.foo} - def _load_archive(self,archive): + return {'model': m, 'foo': self.foo} + + def _load_archive(self, archive): self.foo = archive['foo'] + def save_load(r): - f,path = tempfile.mkstemp(suffix='.npz') + f, path = tempfile.mkstemp(suffix='.npz') r.save(path) return BaseRecommender.load(path) + def check_read_description(r): - f,path = 
tempfile.mkstemp(suffix='.npz') + f, path = tempfile.mkstemp(suffix='.npz') r.save(path) d = BaseRecommender.read_recommender_description(path) - assert_equal(str(r),d) + assert_equal(str(r), d) + def test_save_filepath_condition(): r = BaseRecommender() invalid_filepath = 'no suffix' - assert_raises(ValueError,r.save,invalid_filepath) + assert_raises(ValueError, r.save, invalid_filepath) + def test_save_load(): r = save_load(BaseRecommender()) - assert_equal(type(r),BaseRecommender) + assert_equal(type(r), BaseRecommender) r = MyRecommender() r2 = save_load(r) - assert_equal(type(r2),type(r)) - assert_array_equal(r2.foo,r.foo) - assert_equal(r2.description,r.description) + assert_equal(type(r2), type(r)) + assert_array_equal(r2.foo, r.foo) + assert_equal(r2.description, r.description) + def test_read_recommender_description(): check_read_description(BaseRecommender()) check_read_description(MyRecommender()) + def test_zero_known_item_scores(): train = get_random_coo_matrix().tocsr() predictions = np.random.random_sample(train.shape) r = BaseRecommender() - safe = r._zero_known_item_scores(predictions,train) - num_users,num_items = predictions.shape - for u in xrange(num_users): - for i in xrange(num_items): + safe = r._zero_known_item_scores(predictions, train) + num_users, num_items = predictions.shape + for u in range(num_users): + for i in range(num_items): if i in train[u].indices: - assert_less_equal(safe[u,i],0) + assert_less_equal(safe[u, i], 0) else: - assert_equal(safe[u,i],predictions[u,i]) + assert_equal(safe[u, i], predictions[u, i]) diff --git a/mrec/tests/test_mrec.py b/mrec/tests/test_mrec.py index 09291b5..669194e 100644 --- a/mrec/tests/test_mrec.py +++ b/mrec/tests/test_mrec.py @@ -1,23 +1,23 @@ -import tempfile import os - -from mrec.testing import get_random_coo_matrix -from mrec.testing import assert_sparse_matrix_equal +import tempfile from mrec import load_sparse_matrix from mrec import save_sparse_matrix +from mrec.testing import 
assert_sparse_matrix_equal +from mrec.testing import get_random_coo_matrix + def test_save_load_sparse_matrix(): X = get_random_coo_matrix() - for fmt in ['tsv','csv','npz','mm','fsm']: + for fmt in ['tsv', 'csv', 'npz', 'mm', 'fsm']: if fmt == 'mm': suffix = '.mtx' elif fmt == 'npz' or fmt == 'fsm': suffix = '.npz' else: suffix = '' - f,path = tempfile.mkstemp(suffix=suffix) - save_sparse_matrix(X,fmt,path) - Y = load_sparse_matrix(fmt,path) - assert_sparse_matrix_equal(X,Y) + f, path = tempfile.mkstemp(suffix=suffix) + save_sparse_matrix(X, fmt, path) + Y = load_sparse_matrix(fmt, path) + assert_sparse_matrix_equal(X, Y) os.remove(path) diff --git a/mrec/tests/test_sparse.py b/mrec/tests/test_sparse.py index 1b5f931..470eb93 100644 --- a/mrec/tests/test_sparse.py +++ b/mrec/tests/test_sparse.py @@ -1,75 +1,80 @@ -import tempfile import os -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal +import tempfile -from mrec.testing import get_random_coo_matrix -from mrec.testing import assert_sparse_matrix_equal +from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_equal +from mrec.sparse import fast_sparse_matrix from mrec.sparse import loadtxt -from mrec.sparse import savez from mrec.sparse import loadz -from mrec.sparse import fast_sparse_matrix +from mrec.sparse import savez +from mrec.testing import assert_sparse_matrix_equal +from mrec.testing import get_random_coo_matrix + def test_loadtxt(): X = get_random_coo_matrix() - f,path = tempfile.mkstemp(suffix='.npz') - with open(path,'w') as f: - for i,j,v in zip(X.row,X.col,X.data): - print >>f,'{0}\t{1}\t{2}'.format(i+1,j+1,v) + f, path = tempfile.mkstemp(suffix='.npz') + with open(path, 'w') as f: + for i, j, v in zip(X.row, X.col, X.data): + print('{0}\t{1}\t{2}'.format(i + 1, j + 1, v), file=f) Y = loadtxt(path) os.remove(path) - assert_sparse_matrix_equal(X,Y) + assert_sparse_matrix_equal(X, Y) + def test_savez_loadz(): m 
= get_random_coo_matrix() - f,path = tempfile.mkstemp(suffix='.npz') - savez(m,path) + f, path = tempfile.mkstemp(suffix='.npz') + savez(m, path) n = loadz(path) os.remove(path) - assert_array_equal(n.toarray(),m.toarray()) + assert_array_equal(n.toarray(), m.toarray()) + def test_init_fast_sparse_matrix(): X = get_random_coo_matrix() Y = X.tocsr() Z = X.tocsc() - for M in [X,Y,Z]: + for M in [X, Y, Z]: m = fast_sparse_matrix(M) - assert_array_equal(m.X.toarray(),M.toarray()) - assert_equal(m.shape,M.shape) + assert_array_equal(m.X.toarray(), M.toarray()) + assert_equal(m.shape, M.shape) + def test_fast_get_col(): X = get_random_coo_matrix().tocsc() m = fast_sparse_matrix(X) - rows,cols = X.shape - for j in xrange(cols): - assert_array_equal(m.fast_get_col(j).toarray(),X[:,j].toarray()) + rows, cols = X.shape + for j in range(cols): + assert_array_equal(m.fast_get_col(j).toarray(), X[:, j].toarray()) + def test_fast_update_col(): X = get_random_coo_matrix().tocsc() m = fast_sparse_matrix(X) cols = X.shape[1] - for j in xrange(cols): + for j in range(cols): vals = m.fast_get_col(j).data - if (vals==0).all(): + if (vals == 0).all(): continue - vals[vals!=0] += 1 - m.fast_update_col(j,vals) - expected = X[:,j].toarray() - for i in xrange(expected.shape[0]): + vals[vals != 0] += 1 + m.fast_update_col(j, vals) + expected = X[:, j].toarray() + for i in range(expected.shape[0]): if expected[i] != 0: expected[i] += 1 - assert_array_equal(m.fast_get_col(j).toarray(),expected) + assert_array_equal(m.fast_get_col(j).toarray(), expected) + def test_save_load(): """Save to file as arrays in numpy binary format.""" X = get_random_coo_matrix() m = fast_sparse_matrix(X) - f,path = tempfile.mkstemp(suffix='.npz') + f, path = tempfile.mkstemp(suffix='.npz') m.save(path) n = fast_sparse_matrix.load(path) os.remove(path) - assert_equal(m.shape,n.shape) - assert_array_equal(m.X.toarray(),n.X.toarray()) - assert_array_equal(m.col_view.toarray(),n.col_view.toarray()) - + 
assert_equal(m.shape, n.shape) + assert_array_equal(m.X.toarray(), n.X.toarray()) + assert_array_equal(m.col_view.toarray(), n.col_view.toarray()) diff --git a/setup.py b/setup.py index 2447a0f..a054a66 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,8 @@ install_requires=['numpy', 'scipy', 'scikit-learn', - 'ipython <= 4.0.0', + 'nose', + 'ipyparallel', 'cython', 'psutil'], entry_points={