#!/usr/bin/env python3

#######################################################################
# try and extract url data from wikipedia
# write out as we go version
#
# Author: Garry Morrison
# email: garry@semantic-db.org
# Date: 2015-05-26
# Update: 2015-07-21
# Copyright: closed for now
#
# Usage: ./play_with_wikipeida__fast_write.py data/fragments/0.xml
#
#######################################################################

import sys
import os

from bs4 import BeautifulSoup

document = open(sys.argv[1], 'rb')
soup = BeautifulSoup(document)

from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

C = context_list("wikipedia links")

if not os.path.exists("sw-results"):
    os.makedirs("sw-results")

prefix = sys.argv[1].rsplit('/', 1)[-1].split('.')[0]
destination = "sw-results/" + prefix + "--30k--wikipedia-links.sw"
print(destination)
#sys.exit(0)

def ascii_filter(s):
    # filter down to ascii only:
    return s.encode('ascii', 'ignore').decode('ascii')

# delete/escape chars we don't want inside kets:
def chomp_bad_chars(s):
    # filter down to ascii only, escape \n, delete \r:
    s = s.encode('ascii', 'ignore').decode('ascii').replace('\n', '\\n').replace('\r', '')
    # escape chars that are special in the sw format:
    s = s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    # some more escapes:
    s = s.replace(':', '&colon;').replace('|', '&pipe;')
    return s

# pull out all the [[...]] wiki-link fragments from the page text:
def extract_links(s):
    r = []
    while True:
        try:
            head, tail = s.split('[[', 1)
            fragment, s = tail.split(']]', 1)
            r.append(fragment)
        except:
            break
    return r

# split a "link|anchor text" fragment into its link and anchor parts:
def process_anchor_text(s):
    try:
        link, anchor = s.split('|', 1)
    except:
        link = s
        anchor = link
    link = link.replace(' ', '_')
    return link, anchor

# same split, but return the pieces as kets:
def ket_process_anchor_text(s):
    try:
        link, anchor = s.split('|', 1)
    except:
        link = s
        anchor = link
    link = link.replace(' ', '_')
    link = chomp_bad_chars(link)
    anchor = chomp_bad_chars(anchor)
    return ket("WP: " + link), ket("anchor: " + anchor)

# keep only the link part of a fragment, as a ket:
def ket_process_link(s):
    try:
        link, anchor = s.split('|', 1)
    except:
        link = s
    link = link.replace(' ', '_')  #.lower()  # lower case too? Yup! (I think ...) Nope! Quick test, results are bad. Titles are hard to read in lowercase.
    link = chomp_bad_chars(link)
    return ket("WP: " + link)

print(soup.sitename)

dest = open(destination, 'w')

# one learn rule per page, written out as we go:
for page in soup.find_all('page'):
    try:
        print("title:", page.title)
        text = ascii_filter(page.find('text').text)
#        print("text:", text)
        r = extract_links(text)
#        print("r:", r)
        page_name_ket = ket("WP: " + page.title.text.replace(' ', '_'))  #.lower())
        print("name:", page_name_ket)
        print()

#        result = superposition()
#        result.data = [ket_process_link(x) for x in r]    # what if duplicate links?

#        result = superposition()    # not sure how slow this will be.
#        for x in r:
#            result += ket_process_link(x)

        result = fast_superposition()
        for x in r:
            result += ket_process_link(x)
        dest.write("links-to " + str(page_name_ket) + " => " + str(result.superposition().coeff_sort()) + "\n\n")
    except:
        continue

dest.close()
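
# For reference, each record written above is one learn rule in the .sw file,
# roughly of the form shown below. The page and link names are made-up examples,
# and the exact rendering assumes str(ket(...)) prints as |label> and that a
# superposition joins its kets with ' + ':
#
#   links-to |WP: Some_page> => 2|WP: Repeated_link> + |WP: Another_link>
#
# fast_superposition presumably accumulates coefficients for duplicate links,
# and coeff_sort() orders the kets by those counts before writing.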