#!/usr/bin/env python3

#######################################################################
# try to extract link data from a wikipedia xml dump
#
# Author: Garry Morrison
# email: garry@semantic-db.org
# Date: 2015-05-24
# Update:
# Copyright: closed for now
#
# Usage: ./play_with_wikipedia.py wiki.xml
#
#######################################################################

import sys

from bs4 import BeautifulSoup

from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

# parse the dump; 'html.parser' is always available and lower-cases tag
# names, so find_all('page') and friends still match. Swap in 'lxml-xml'
# for speed on large dumps if lxml is installed.
with open(sys.argv[1], 'rb') as document:
    soup = BeautifulSoup(document, 'html.parser')

C = context_list("wikipedia links")


def ascii_filter(s):
    # filter down to ascii only:
    return s.encode('ascii', 'ignore').decode('ascii')


# delete/escape chars we don't want inside kets:
def chomp_bad_chars(s):
    # filter down to ascii only, escape \n, delete \r:
    s = s.encode('ascii', 'ignore').decode('ascii').replace('\n', '\\n').replace('\r', '')
    # some escapes:
    s = s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    # some more escapes:
    s = s.replace(':', '&colon;').replace('|', '&pipe;')
    return s


# pull out the text inside [[...]] wiki-link markup:
def extract_links(s):
    r = []
    while True:
        try:
            head, tail = s.split('[[', 1)
            fragment, s = tail.split(']]', 1)
            r.append(fragment)
        except ValueError:          # no more [[...]] pairs
            break
    return r


# split "target|anchor text" into its two pieces:
def process_anchor_text(s):
    try:
        link, anchor = s.split('|', 1)
    except ValueError:              # no '|', so the anchor text is the link itself
        link = s
        anchor = link
    link = link.replace(' ', '_')
    return link, anchor


# same again, but return kets ready to learn into the context:
def ket_process_anchor_text(s):
    try:
        link, anchor = s.split('|', 1)
    except ValueError:
        link = s
        anchor = link
    link = link.replace(' ', '_')
    link = chomp_bad_chars(link)
    anchor = chomp_bad_chars(anchor)
    return ket("WP: " + link), ket("anchor: " + anchor)


# quick test data, kept for reference:
#r = ['political philosophy', 'stateless society|stateless societies', 'self-governance|self-governed', 'Hierarchy|hierarchical', 'Free association (communism and anarchism)|free associations', 'Peter Kropotkin', 'An Anarchist FAQ', 'state (polity)|state', 'The Globe and Mail']
#for x in r:
#    print("x:", x)
##    print("links:", process_anchor_text(x))
#    link, anchor = ket_process_anchor_text(x)
#    print("link:", link)
#    print("anchor:", anchor)
#sys.exit(0)

print(soup.sitename)
#print(soup.prettify())
#print(soup.page.text)

for page in soup.find_all('page'):
    try:
        print("title:", page.title)
        text = ascii_filter(page.find('text').text)
#        print("text:", text)
        r = extract_links(text)
#        print("r:", r)
        page_name_ket = ket("WP: " + page.title.text.replace(' ', '_'))
        print("name:", page_name_ket)
        print()

        # now learn it all:
        for x in r:
            link, anchor = ket_process_anchor_text(x)
            C.add_learn("links-to", page_name_ket, link)
            C.add_learn("inverse-links-to", link, page_name_ket)

            C.add_learn("contains-anchor", page_name_ket, anchor)
            C.add_learn("inverse-contains-anchor", anchor, page_name_ket)

            C.add_learn("links-to", anchor, link)
            C.add_learn("inverse-links-to", link, anchor)
    except Exception:               # skip pages without a usable <title> or <text>
        continue

#print(C.dump_universe())
save_sw(C, "wikipedia-links.sw", False)
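
#######################################################################
# A minimal sketch of the input this script expects, kept as a comment
# for reference. The element names ('mediawiki', 'siteinfo', 'sitename',
# 'page', 'title', 'revision', 'text') follow the standard MediaWiki XML
# export format; the sample page content below is an illustrative
# assumption, not taken from a real dump.
#
# <mediawiki ...>
#   <siteinfo>
#     <sitename>Wikipedia</sitename>
#     ...
#   </siteinfo>
#   <page>
#     <title>Anarchism</title>
#     <revision>
#       ...
#       <text>Anarchism is a [[political philosophy]] that advocates
#       [[stateless society|stateless societies]] ...</text>
#     </revision>
#   </page>
#   ...
# </mediawiki>
#
# For a page like the one above, extract_links() on the <text> payload
# returns ['political philosophy', 'stateless society|stateless societies'],
# which the main loop then learns as links-to / inverse-links-to /
# contains-anchor rules before save_sw() writes everything out to
# wikipedia-links.sw.
#######################################################################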