Example NL extractor 1
Mine navigeerimisribale
Mine otsikasti
# Sample input text; it is passed to main() at the end of this file.
intxt = (
    "Barack Obama went to China yesterday.\n"
    "He lives in Grand Hyatt Beijing. This is a superb hotel."
)
# Known named entities. sent_has_name_at() matches element 0 of each row
# (the entity's word sequence) against sentence tokens. The remaining
# columns look like: display name, tag, reference URL, entity kind
# (NOTE(review): column meanings inferred from the values; confirm).
nertable = [
    [
        ["Barack", "Obama"],
        "Barack Obama",
        "ner_noun",
        "http://en.wikipedia.org/wiki/Barack_Obama",
        "person",
    ],
    [
        ["China"],
        "China",
        "ner_noun",
        "http://en.wikipedia.org/wiki/China",
        "country",
    ],
    [
        ["Grand", "Hyatt", "Beijing"],
        "Grand Hyatt Beijing",
        "ner_noun",
        "https://en.wikipedia.org/wiki/Grand_Hyatt_Beijing",
        "company",
    ],
]
# Known ordinary words. sent_has_pos_at() matches element 0 of each row
# (the word sequence) against sentence tokens. The remaining columns look
# like: base form, part of speech, reference URL, extra feature or None
# (NOTE(review): column meanings inferred from the values; confirm).
postable = [
    [
        ["went"],
        "go",
        "verb",
        "http://conceptnet5.media.mit.edu/data/5.3/c/en/go",
        "past",
    ],
    [
        ["to"],
        "to",
        "preposition",
        "http://conceptnet5.media.mit.edu/data/5.3/c/en/to",
        None,
    ],
    [
        ["yesterday"],
        "yesterday",
        "adverb",
        "http://conceptnet5.media.mit.edu/data/5.3/c/en/yesterday",
        None,
    ],
    [
        ["this"],
        "this",
        "adjective",
        "http://conceptnet5.media.mit.edu/data/5.3/c/en/this",
        None,
    ],
]
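# Sketch of the relation triples the extractor is apparently meant to
# derive from intxt (the code below does not produce them yet; TODO confirm):
# [barack,action1,china] "to china", "went ... yesterday"
# [action1,activity,moveto]
# [action1,time,past]
# [he,action2, grandhyattbeijing]
# [action2,activity,live_in]
# [action2,time,current]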
# TODO:
#sentencetable=[
# [["noun","verb","noun"],[[0,1,2]]]
def main(txt):
    """Run the whole pipeline on txt, printing each intermediate result.

    The text is passed through split_text, ner_text and pos_text in that
    order. After each stage a label line and the stage's raw result are
    printed; the final result is then printed with pretty_print.

    Args:
        txt: The raw text to process.
    """
    stages = (
        ("splitted:", split_text),
        ("nerred:", ner_text),
        ("posed:", pos_text),
    )
    current = txt
    for label, stage in stages:
        # Each stage consumes the previous stage's output.
        current = stage(current)
        print(label)
        print(current)
    pretty_print(current)
def ner_text(slst):
    """Replace runs of tokens that form a known name with its table entry.

    Every sentence is scanned left to right. Wherever sent_has_name_at
    reports a match, the matched tokens are replaced by the single table
    entry it returns; all other tokens are copied through unchanged.

    Args:
        slst: List of sentences, each a list of tokens.

    Returns:
        A new list of sentences with the same ordering, where matched
        token runs have been collapsed into table entries.
    """
    tagged = []
    for sentence in slst:
        out = []
        pos = 0
        while pos < len(sentence):
            hit = sent_has_name_at(sentence, pos)
            if hit:
                # hit[1] is the index of the LAST token of the match, so
                # the shared increment below lands just past it.
                item, pos = hit[0], hit[1]
            else:
                item = sentence[pos]
            out.append(item)
            pos += 1
        tagged.append(out)
    return tagged
def sent_has_name_at(sent, i):
    """Look for a known name whose words start at position i of sent.

    Rows of the module-level nertable are tried in order; the first row
    whose word list (element 0) equals the tokens of sent beginning at i
    is the result.

    Args:
        sent: Sentence as a list of tokens.
        i: Index in sent at which the match must begin.

    Returns:
        [row, last] where row is the matching table row and last is the
        index of the final token covered by the match; 0 when sent is
        empty or i is past its end; None when no row matches.
    """
    if not sent or i >= len(sent):
        return 0
    limit = len(sent)
    for row in nertable:
        words = row[0]
        width = len(words)
        # Token-by-token comparison that stops at the first mismatch or
        # as soon as the row would run past the end of the sentence.
        matched = all(
            i + k < limit and sent[i + k] == words[k]
            for k in range(width)
        )
        if matched:
            return [row, i + width - 1]
    return None
def pos_text(slst):
    """Replace known ordinary words with their postable entries.

    Every sentence is scanned left to right. Tokens that are already
    lists (for example entries substituted by an earlier stage) are
    copied through without a lookup. For the rest, a match reported by
    sent_has_pos_at replaces the matched tokens with the returned table
    entry; unmatched tokens are copied through unchanged.

    Args:
        slst: List of sentences, each a list of tokens.

    Returns:
        A new list of sentences with matched words replaced by table
        entries.
    """
    tagged = []
    for sentence in slst:
        out = []
        pos = 0
        while pos < len(sentence):
            item = sentence[pos]
            # Exact-type check: only plain lists count as already tagged.
            if type(item) is not list:
                hit = sent_has_pos_at(sentence, pos)
                if hit:
                    # hit[1] is the index of the LAST matched token; the
                    # shared increment below steps past it.
                    item, pos = hit[0], hit[1]
            out.append(item)
            pos += 1
        tagged.append(out)
    return tagged
def sent_has_pos_at(sent, i):
    """Look for a known word sequence starting at position i of sent.

    Rows of the module-level postable are tried in order; the first row
    whose word list (element 0) equals the tokens of sent beginning at i
    is the result. The comparison is exact, so it is case-sensitive.

    Args:
        sent: Sentence as a list of tokens.
        i: Index in sent at which the match must begin.

    Returns:
        [row, last] where row is the matching table row and last is the
        index of the final token covered by the match; 0 when sent is
        empty or i is past its end; None when no row matches.
    """
    if not sent or i >= len(sent):
        return 0
    limit = len(sent)
    for row in postable:
        words = row[0]
        width = len(words)
        # Token-by-token comparison that stops at the first mismatch or
        # as soon as the row would run past the end of the sentence.
        matched = all(
            i + k < limit and sent[i + k] == words[k]
            for k in range(width)
        )
        if matched:
            return [row, i + width - 1]
    return None
def split_text(txt):
    """Split raw text into sentences, each a list of word tokens.

    Commas are treated as whitespace, sentences are delimited by ".",
    and words are separated by any run of whitespace (spaces, newlines,
    tabs). Segments that contain no words, such as the remainder after a
    final "." followed by a newline or the gap in ". .", are dropped, so
    the result never contains an empty sentence.

    Args:
        txt: The text to tokenize.

    Returns:
        A list of sentences in input order, each a non-empty list of
        word strings.
    """
    sentences = []
    for segment in txt.replace(",", " ").split("."):
        # str.split() with no argument splits on any whitespace run and
        # never yields empty strings, so no per-word stripping is needed.
        words = segment.split()
        if words:
            sentences.append(words)
    return sentences
def pretty_print(sentlst):
    """Print each sentence as a header line followed by its items.

    For every sentence a "sentence: " line is printed, then one line per
    item consisting of a single space and the item's str() form.

    Args:
        sentlst: List of sentences, each an iterable of items.
    """
    for items in sentlst:
        print("sentence: ")
        for item in items:
            rendered = str(item)
            print(" " + rendered)
# Run the pipeline on the sample text. There is no __main__ guard, so this
# also executes when the file is imported.
main(intxt)