#!/opt/python3/bin/python3

import sys

from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

# Shared context used for all in-memory operator lookups (apply_op).
C = context_list("find all average movie rating")

#one = sys.argv[1]


def file_recall(filename, op, label):
    """Look up the rule "op |label> => ..." directly in a .sw file.

    Scans `filename` line by line for the first rule whose left-hand side
    matches the operator `op` applied to `label`, and returns that rule's
    literal superposition, scaled by the ket's coefficient.

    filename -- path to a .sw rules file
    op       -- operator name, e.g. "imdb-rating-self"
    label    -- either a ket (coefficient and label are taken from it)
                or a plain string label (coefficient defaults to 1)

    Returns the matching superposition multiplied by the coefficient, or
    the empty ket ket("", 0) if no rule matches.
    """
    if isinstance(label, ket):
        coeff = label.value
        ket_label = label.label
    else:
        coeff = 1
        ket_label = label

    # Left-hand side of the rule we are searching for, e.g.
    #   imdb-rating-self |movie: Alien> =>
    pattern = op + " |" + ket_label + "> => "
    n = len(pattern)
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith(pattern):
                # Only the first matching rule is used; the rest of the
                # file is not scanned.
                return extract_literal_superposition(line[n:])[0].multiply(coeff)
    return ket("", 0)


#imdb_sw = "sw-examples/imdb.sw"
# imdb-sans-actors should be half the size, and so twice the speed:
imdb_sw = "sw-examples/imdb-sans-actors.sw"

# this file only has movies with 10k or more votes:
#ratings_sw = "sw-examples/imdb-ratings.sw"
# this is the full set:
#votes_sw = "sw-examples/complete-imdb-ratings.sw"
# full set, but only imdb-votes-self entries, again for speed:
votes_sw = "sw-examples/imdb-votes-self-only.sw"
# full set, but only imdb-rating-self entries, roughly 4x faster:
ratings_sw = "sw-examples/imdb-ratings-self-only.sw"

# Pre-load votes and ratings into the in-memory context: much faster than
# re-scanning the .sw files per actor via file_recall above.
load_sw(C, votes_sw)
load_sw(C, ratings_sw)

actors_file = "imdb-actors-list.txt"
# NOTE(review): the small test file below overrides the full list above —
# presumably left in from testing; remove this line for a full run.
actors_file = "short-actors.txt"

ave_file = "all-actors-average.txt"
wave_file = "all-actors-weighted-average.txt"

# Output files are closed at the end of the main loop below.
ave_f = open(ave_file, 'w')
wave_f = open(wave_file, 'w')

# Iterating a list of actors and re-scanning the full imdb file for each one
# would be O(n^2).  Instead the main loop below walks the imdb.sw file once,
# which is much faster.
imdb_data = "sw-examples/imdb-sans-movies.sw"
#imdb_data = "sw-examples/trial.sw"

with open(imdb_data, 'r') as f:
    for line in f:
        try:
            # Each rule line has the shape:
            #   movies |actor: NAME> => |movie: A> + |movie: B> + ...
            head, tail = line.rstrip().split(" => ", 1)
            # Strip the fixed 'movies |actor: ' prefix (15 chars) and the
            # trailing '>' to recover the bare actor name.
            actor = head[15:-1]

            # Parse the right-hand side into a superposition of movie kets.
            raw_movies = superposition()
            for movie_label in tail[1:-1].split("> + |"):
                raw_movies += ket(movie_label)
            # Normalise all coefficients to 1 before applying operators.
            raw_movies = raw_movies.apply_sigmoid(clean)

            # In-memory lookups against the pre-loaded context C (see the
            # load_sw calls above); much faster than per-movie file_recall.
            movie_ratings = raw_movies.apply_op(C, "imdb-rating-self")
            movie_votes = raw_movies.apply_op(C, "imdb-votes-self")

            # Plain average movie rating for this actor (0 if no ratings).
            count = movie_ratings.count()
            count_sum = movie_ratings.count_sum()
            average = count_sum / count if count > 0 else 0

            # Vote-weighted average: weight each rating by its vote count.
            weighted_votes = multiply(movie_votes, movie_ratings)
            weighted_count = movie_votes.count_sum()
            weighted_count_sum = weighted_votes.count_sum()
            weighted_average = weighted_count_sum / weighted_count if weighted_count > 0 else 0

            # One tab-separated "score<TAB>actor" line per actor.
            ave_f.write("%.2f\t%s\n" % (average, actor))
            wave_f.write("%.2f\t%s\n" % (weighted_average, actor))
        except Exception:
            # Malformed lines (no " => " separator, empty superposition,
            # etc.) are skipped rather than aborting the whole run.  Was a
            # bare `except:`, which also swallowed KeyboardInterrupt.
            continue

ave_f.close()
wave_f.close()

sys.exit(0)

# --- unreachable single-actor debug output below ---
# NOTE(review): this block runs only if the sys.exit(0) above is removed,
# and it requires `one` (the commented-out sys.argv[1] near the top) plus
# the loop variables from the final iteration.
print("============================")
print("actor:", one)
print("number of movies:", len(movie_ratings))
print("ratings:")
print(movie_ratings.long_display())
print("votes:")
print(movie_votes.long_display())
print("average movie rating:", "%.2f" % average)
print("weighted average move rating:", "%.2f" % weighted_average)