# Example NL extractor 1
"""Toy natural-language extractor.

Pipeline: split raw text into sentences of word tokens, replace known
named-entity phrases with their ``nertable`` entries, then replace known
function/content words with their ``postable`` entries.

Table entry layout (both tables):
    [phrase_tokens, canonical_form, category, reference_url, extra]
"""

intxt = """Barack Obama went to China yesterday. He lives in Grand Hyatt Beijing. This is a superb hotel."""

# Known named entities; matched case-sensitively against consecutive tokens.
nertable = [
    [["Barack", "Obama"], "Barack Obama", "ner_noun",
     "http://en.wikipedia.org/wiki/Barack_Obama", "person"],
    [["China"], "China", "ner_noun",
     "http://en.wikipedia.org/wiki/China", "country"],
    [["Grand", "Hyatt", "Beijing"], "Grand Hyatt Beijing", "ner_noun",
     "https://en.wikipedia.org/wiki/Grand_Hyatt_Beijing", "company"],
]

# Known non-entity words; matched case-insensitively (see sent_has_pos_at).
postable = [
    [["went"], "go", "verb",
     "http://conceptnet5.media.mit.edu/data/5.3/c/en/go", "past"],
    [["to"], "to", "preposition",
     "http://conceptnet5.media.mit.edu/data/5.3/c/en/to", None],
    [["yesterday"], "yesterday", "adverb",
     "http://conceptnet5.media.mit.edu/data/5.3/c/en/yesterday", None],
    [["this"], "this", "adjective",
     "http://conceptnet5.media.mit.edu/data/5.3/c/en/this", None],
]

# Target triples the extractor should eventually produce:
# [barack,action1,china] "to china", "went ... yesterday"
# [action1,activity,moveto]
# [action1,time,past]
# [he,action2, grandhyattbeijing]
# [action2,activity,live_in]
# [action2,time,current]

# TODO:
# sentencetable=[
#   [["noun","verb","noun"],[[0,1,2]]]


def main(txt):
    """Run the split -> NER -> POS pipeline on *txt*, printing each stage."""
    splitted = split_text(txt)
    print("splitted:")
    print(splitted)
    nerred = ner_text(splitted)
    print("nerred:")
    print(nerred)
    posed = pos_text(nerred)
    print("posed:")
    print(posed)
    pretty_print(posed)


def _same_token(token, expected, ignore_case):
    """Compare one sentence token with one phrase token."""
    # Only strings can be case-folded; already-tagged entries are lists and
    # simply compare unequal to any phrase word.
    if ignore_case and isinstance(token, str) and isinstance(expected, str):
        return token.lower() == expected.lower()
    return token == expected


def _match_phrase_at(sent, i, table, ignore_case=False):
    """Return ``[entry, last_index]`` for the first table phrase found at *i*.

    ``last_index`` is the index of the final matched token (not one past
    it). Returns ``None`` when nothing matches or *i* is out of range.
    """
    if not sent or i < 0 or i >= len(sent):
        return None
    for known in table:
        phrase = known[0]
        end = i + len(phrase)
        # An empty phrase would "match" everywhere and stall the caller's
        # scan, and a phrase longer than the remaining tokens cannot match.
        if not phrase or end > len(sent):
            continue
        if all(_same_token(tok, exp, ignore_case)
               for tok, exp in zip(sent[i:end], phrase)):
            return [known, end - 1]
    return None


def _tag_sentence(sent, matcher):
    """Scan *sent* left to right, replacing matched phrases with entries."""
    tagged = []
    i = 0
    while i < len(sent):
        token = sent[i]
        # Entries tagged by an earlier pass are lists; pass them through.
        hit = None if isinstance(token, list) else matcher(sent, i)
        if hit:
            tagged.append(hit[0])
            i = hit[1] + 1  # hit[1] is the last matched index
        else:
            tagged.append(token)
            i += 1
    return tagged


def ner_text(slst):
    """Replace named-entity phrases in every sentence of *slst*.

    Returns a new list of sentences in which each run of tokens matching a
    ``nertable`` phrase is replaced by that table entry.
    """
    return [_tag_sentence(sent, sent_has_name_at) for sent in slst]


def sent_has_name_at(sent, i):
    """Return ``[nertable_entry, last_index]`` if a known name starts at *i*.

    Matching is case-sensitive. Returns ``None`` when there is no match.
    """
    return _match_phrase_at(sent, i, nertable)


def pos_text(slst):
    """Replace known words in every sentence of *slst* with postable entries.

    Tokens that are already lists (entries placed by ``ner_text``) are kept
    unchanged.
    """
    return [_tag_sentence(sent, sent_has_pos_at) for sent in slst]


def sent_has_pos_at(sent, i):
    """Return ``[postable_entry, last_index]`` if a known word starts at *i*.

    Matching ignores case so that sentence-initial capitalisation ("This")
    still finds the lowercase table entry. Returns ``None`` on no match.
    """
    return _match_phrase_at(sent, i, postable, ignore_case=True)


def split_text(txt):
    """Split *txt* into sentences (on ".") and each sentence into words.

    Commas are treated as whitespace. Fragments containing no words, such
    as the text after a trailing period, are dropped.
    """
    sentences = []
    for fragment in txt.replace(",", " ").split("."):
        words = fragment.split()
        if words:
            sentences.append(words)
    return sentences


def pretty_print(sentlst):
    """Print each sentence followed by one line per token or entry."""
    for sent in sentlst:
        print("sentence: ")
        for phrase in sent:
            print(" " + str(phrase))


if __name__ == "__main__":
    main(intxt)