Example NL extractor 1

Allikas: Kursused
Redaktsioon seisuga 20. oktoober 2015, kell 15:37 kasutajalt Tanel (arutelu | kaastöö) (Uus lehekülg: ' <pre> intxt="""Barack Obama went to China yesterday. He lives in Grand Hyatt Beijing. This is a superb hotel.""" nertable=[ [["Barack","Obama"],"Barack Obama","ner_noun","...')
(erin) ←Vanem redaktsioon | Viimane redaktsiooni (erin) | Uuem redaktsioon→ (erin)
Mine navigeerimisribale Mine otsikasti

# Sample input for the extractor: three English sentences on two lines.
# The space and line break after the first sentence are written out
# explicitly so the whitespace inside the text is visible.
intxt = (
  "Barack Obama went to China yesterday. \n"
  "He lives in Grand Hyatt Beijing. This is a superb hotel."
)

# Known named entities. Each row has the shape
#   [words, name, tag, url, category]
# where words is the name split on single spaces; the matcher in this file
# only reads row[0]. The remaining fields look like descriptive metadata
# (tag is always "ner_noun" here).
nertable = [
  [name.split(" "), name, "ner_noun", url, category]
  for name, url, category in (
    ("Barack Obama", "http://en.wikipedia.org/wiki/Barack_Obama", "person"),
    ("China", "http://en.wikipedia.org/wiki/China", "country"),
    ("Grand Hyatt Beijing", "https://en.wikipedia.org/wiki/Grand_Hyatt_Beijing", "company"),
  )
]

# Known part-of-speech words. Each row has the shape
#   [[word], base_form, tag, url, feature]
# The matcher in this file only reads row[0]; the last field is "past" for
# "went" and None elsewhere (presumably a tense marker -- confirm).
postable = [
  [[word], base_form, tag, url, feature]
  for word, base_form, tag, url, feature in (
    ("went", "go", "verb", "http://conceptnet5.media.mit.edu/data/5.3/c/en/go", "past"),
    ("to", "to", "preposition", "http://conceptnet5.media.mit.edu/data/5.3/c/en/to", None),
    ("yesterday", "yesterday", "adverb", "http://conceptnet5.media.mit.edu/data/5.3/c/en/yesterday", None),
    ("this", "this", "adjective", "http://conceptnet5.media.mit.edu/data/5.3/c/en/this", None),
  )
]

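# Sketch of the triples the two first sample sentences should presumably
# yield (nothing in the code below produces them yet).
#
# Sentence 1 -- cues: "to china", "went ... yesterday"
# [barack,action1,china]
# [action1,activity,moveto]
# [action1,time,past]

# Sentence 2
# [he,action2,grandhyattbeijing]
# [action2,activity,live_in]
# [action2,time,current]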

# TODO:
#sentencetable=[
#  [["noun","verb","noun"],[[0,1,2]]]
  
def main(txt):
  """Run the whole pipeline on txt and print every intermediate result.

  The text is split into sentences, then passed through ner_text and
  pos_text. After each stage a label line ("splitted:", "nerred:",
  "posed:") and the raw result are printed; the final result is also
  handed to pretty_print. Nothing is returned.
  """
  def show(label,value):
    # Print the stage label and its result, then pass the result through.
    print(label+":")
    print(value)
    return value

  sentences=show("splitted",split_text(txt))
  with_names=show("nerred",ner_text(sentences))
  with_pos=show("posed",pos_text(with_names))
  pretty_print(with_pos)
  
def ner_text(slst):
  """Replace known names in every sentence with their table entries.

  slst is a list of sentences, each a list of words. Every position is
  offered to sent_has_name_at; on a hit (a truthy [entry, last_index]
  result) the entry is emitted and scanning resumes after last_index,
  otherwise the word itself is emitted unchanged. Returns a new list of
  sentences; the input lists are not modified.
  """
  tagged=[]
  for words in slst:
    out=[]
    pos=0
    while pos<len(words):
      hit=sent_has_name_at(words,pos)
      if not hit:
        # No name starts here: keep the plain word and move on by one.
        out.append(words[pos])
        pos+=1
        continue
      out.append(hit[0])
      # hit[1] is the last index covered by the match; continue right after it.
      pos=hit[1]+1
    tagged.append(out)
  return tagged

def sent_has_name_at(sent,i):
  """Find the first nertable entry whose words start at sent[i].

  Entries are tried in table order; an entry matches when each word of
  entry[0] equals the corresponding element sent[i], sent[i+1], ... and
  the phrase does not run past the end of sent.

  Returns [entry, last] where last is the index of the final matched
  element. Returns 0 when sent is empty or i is at/after its end, and
  None when no entry matches (both are falsy).
  """
  if not sent or i>=len(sent):
    return 0
  for entry in nertable:
    words=entry[0]
    size=len(words)
    # The phrase must fit inside the sentence before elements are compared.
    if i+size<=len(sent) and all(sent[i+k]==words[k] for k in range(size)):
      return [entry,i+size-1]
  return None



def pos_text(slst):
  """Replace known part-of-speech words in every sentence with table entries.

  slst is a list of sentences whose elements are either plain words or
  lists (elements that are already lists are copied through untouched and
  never looked up). Every other position is offered to sent_has_pos_at; on
  a hit (a truthy [entry, last_index] result) the entry is emitted and
  scanning resumes after last_index, otherwise the element is kept as is.
  Returns a new list of sentences.
  """
  result=[]
  for words in slst:
    out=[]
    pos=0
    while pos<len(words):
      item=words[pos]
      # Exact list type check: list elements are treated as already tagged.
      hit=None if type(item) is list else sent_has_pos_at(words,pos)
      if hit:
        out.append(hit[0])
        # hit[1] is the last index covered by the match.
        pos=hit[1]+1
      else:
        out.append(item)
        pos+=1
    result.append(out)
  return result

def sent_has_pos_at(sent,i):
  """Find the first postable entry whose words start at sent[i].

  Entries are tried in table order; an entry matches when each word of
  entry[0] lines up with sent[i], sent[i+1], ... and the phrase does not
  run past the end of sent. Strings are compared case-insensitively, so a
  capitalised sentence-initial word such as "This" matches the lowercase
  table word "this". Non-string elements only match when they are equal.
  Entries with an empty word list are skipped: they would match everywhere
  and report a position before i, which would stall the caller's scan.

  Returns [entry, last] where last is the index of the final matched
  element. Returns 0 when sent is empty or i is at/after its end, and
  None when no entry matches (both are falsy).
  """
  def same_word(a,b):
    # Equal as-is, or both strings that differ only in letter case.
    if a==b: return True
    return isinstance(a,str) and isinstance(b,str) and a.lower()==b.lower()

  if not sent: return 0
  if i>=len(sent): return 0
  for known in postable:
    phrase=known[0]
    if not phrase: continue
    # The phrase must fit inside the sentence before elements are compared.
    if i+len(phrase)>len(sent): continue
    if all(same_word(sent[i+j],phrase[j]) for j in range(len(phrase))):
      return [known,i+len(phrase)-1]
  return None

def split_text(txt):
  """Split txt into sentences, each returned as a list of words.

  Commas are treated as spaces, sentences end at every "." and words are
  separated by any run of whitespace (spaces, newlines, tabs, ...).
  Fragments that contain no words at all -- for example the whitespace
  after the final full stop -- are dropped instead of being returned as
  empty sentences.

  Returns a list of non-empty word lists; an empty or blank txt gives [].
  """
  sentences=[]
  for fragment in txt.replace(","," ").split("."):
    # split() with no argument splits on any whitespace and never yields "".
    words=fragment.split()
    if words:
      sentences.append(words)
  return sentences

def pretty_print(sentlst):
  """Print each sentence as a "sentence: " header followed by its elements.

  Every element of a sentence is printed on its own line, converted with
  str() and indented by two spaces. Nothing is returned.
  """
  for sentence in sentlst:
    print("sentence: ")
    for element in sentence:
      print(f"  {element!s}")

# Run the pipeline on the sample text. This is a plain top-level call, so it
# executes whenever the file is run or imported.
main(intxt)