Temporary draft by tt

Allikas: Kursused
Redaktsioon seisuga 27. oktoober 2015, kell 17:21 kasutajalt Tanel (arutelu | kaastöö) (Uus lehekülg: ' intxt="""Barack Obama went to China yesterday. He lives in Grand Hyatt Beijing. This is a superb hotel.""" intxt="""Barack Obama went to China yesterday. He lives in Grand H...')
(erin) ←Vanem redaktsioon | Viimane redaktsiooni (erin) | Uuem redaktsioon→ (erin)
Mine navigeerimisribale Mine otsikasti

# Sample input, three-sentence variant.
# NOTE: this value is overwritten by the assignment right below, so it is
# never processed unless the second assignment is removed.
intxt="""Barack Obama went to China yesterday. He lives in Grand Hyatt Beijing. This is a superb hotel."""

# Sample input, two-sentence variant; this is the text handed to main() at the
# bottom of the file.
intxt="""Barack Obama went to China yesterday. He lives in Grand Hyatt Beijing."""

# Named-entity lookup table used by sent_has_name_at().
# Each entry is a five-element list:
#   [0] the entity as a list of tokens (compared token by token, case-sensitively),
#   [1] the entity as one string,
#   [2] the word class tag (always "ner_noun" here),
#   [3] a Wikipedia URL, used as the node identifier in generated triples,
#   [4] a ConceptNet URL, emitted as the object of an "rdf:type" triple.
nertable=[

 [["Barack","Obama"],"Barack Obama","ner_noun","http://en.wikipedia.org/wiki/Barack_Obama",
     "http://conceptnet5.media.mit.edu/web/c/en/person"],
 [["China"],"China","ner_noun","http://en.wikipedia.org/wiki/China",
      "http://conceptnet5.media.mit.edu/web/c/en/country"],
 [["Grand","Hyatt","Beijing"],"Grand Hyatt Beijing","ner_noun","https://en.wikipedia.org/wiki/Grand_Hyatt_Beijing",
      "http://conceptnet5.media.mit.edu/web/c/en/company"]

]

# Part-of-speech lookup table used by sent_has_pos_at().
# Each entry is a five-element list:
#   [0] the surface form as a list of lower-case tokens (the matcher also
#       tries the lower-cased sentence token, so "He" matches "he"),
#   [1] the base form ("went" -> "go", "is" -> "be"),
#   [2] the word class tag: verb / preposition / adverb / pronoun / noun / adjective,
#   [3] a ConceptNet URL, used as the identifier in generated triples,
#   [4] extra information: "past" for "went", None everywhere else.
#       simple_rdf_sentence() reads index 4 of *noun* entries as an rdf:type
#       value; for the only noun here ("hotel") it is None.
postable=[

 [["went"],"go","verb","http://conceptnet5.media.mit.edu/web/c/en/go","past"],
 [["to"],"to","preposition","http://conceptnet5.media.mit.edu/web/c/en/to",None],
 [["yesterday"],"yesterday","adverb","http://conceptnet5.media.mit.edu/web/c/en/yesterday",None],
 # NOTE(review): unlike every other entry this URL has a "/data/" path segment
 # and ends in "this" rather than "he" -- possibly a copy-paste slip; confirm.
 [["he"],"he","pronoun","http://conceptnet5.media.mit.edu/data/web/c/en/this",None],
 [["lives"],"live","verb","http://conceptnet5.media.mit.edu/web/c/en/live",None],
 [["in"],"in","preposition","http://conceptnet5.media.mit.edu/web/c/en/in",None],
 [["this"],"this","pronoun","http://conceptnet5.media.mit.edu/web/c/en/this",None],
 [["is"],"be","verb","http://conceptnet5.media.mit.edu/web/c/en/type/v/identify_as_belonging_to_a_certain_type",None],
 [["superb"],"superb","adjective","http://conceptnet5.media.mit.edu/web/c/en/superb",None],
 [["hotel"],"hotel","noun","http://conceptnet5.media.mit.edu/web/c/en/hotel",None]

]

# Module-wide counter behind create_local_id(); it is incremented before each
# id is issued, so the first generated id is "id:local_1".
idnum=0

# [barack,action1,china] "to china", "went ... yesterday"
# [action1,activity,moveto]
# [action1,time,past]

# [he,action2, grandhyattbeijing]
# [action2,activity,live_in]
# [action2,time,current]

# TODO:
# sentencetable=[
# [["noun","verb","noun"],0,1,2]

def main(txt):
    """Run the demo pipeline on ``txt`` and print every intermediate stage.

    The stages run in this order: split_text, ner_text, pos_text, simple_rdf.
    The raw result of each stage is printed under a label; the POS-tagged
    sentences and the final triples are additionally listed with
    pretty_print.

    Args:
        txt: the input text as a single string.

    Returns:
        None.
    """
    def announce(label, value):
        # Print the stage label, then the stage's raw result, and hand the
        # result back so the call can be used inline.
        print(label)
        print(value)
        return value

    words = announce("splitted:", split_text(txt))
    named = announce("nerred:", ner_text(words))
    tagged = announce("posed:", pos_text(named))
    pretty_print(tagged)
    triples = announce("rdf:", simple_rdf(tagged))
    pretty_print(triples)
 

def ner_text(slst):
    """Replace known entity names in every sentence with their table entry.

    Each sentence is scanned left to right. Wherever sent_has_name_at()
    reports a match, the matched tokens are replaced by the single table
    entry it returned and scanning resumes after the last matched token.
    All other tokens are copied unchanged.

    Args:
        slst: list of sentences, each a list of tokens.

    Returns:
        A new list of sentences of the same length as ``slst``.
    """
    tagged_sentences = []
    for words in slst:
        tagged = []
        pos = 0
        while pos < len(words):
            hit = sent_has_name_at(words, pos)
            if not hit:
                tagged.append(words[pos])
                pos += 1
                continue
            # hit is [table entry, index of the last token of the match].
            tagged.append(hit[0])
            pos = hit[1] + 1
        tagged_sentences.append(tagged)
    return tagged_sentences

def sent_has_name_at(sent,i,table=None):
    """Look for a known entity name starting at position ``i`` of ``sent``.

    Table entries are tried in order and the first one whose token list
    equals the sentence tokens starting at ``i`` wins (comparison is
    case-sensitive). Entries with an empty token list are skipped: they would
    otherwise "match" everywhere with an end index of ``i - 1`` and make the
    caller's scan loop spin forever.

    Args:
        sent: the sentence as a list of tokens.
        i: index in ``sent`` at which the name has to start.
        table: optional lookup table with the same layout as ``nertable``;
            the module-level ``nertable`` is used when omitted.

    Returns:
        ``[entry, last_index]`` on a match, where ``last_index`` is the index
        of the last matched token; ``0`` when ``sent`` is empty or ``i`` is
        past its end; ``None`` when no entry matches. Every failure value is
        falsy, so callers can simply test the result for truth.
    """
    if not sent:
        return 0
    if i >= len(sent):
        return 0
    if table is None:
        table = nertable
    for known in table:
        phrase = known[0]
        size = len(phrase)
        # Skip empty phrases and phrases that would run past the sentence end.
        if size == 0 or i + size > len(sent):
            continue
        if all(sent[i + j] == phrase[j] for j in range(size)):
            return [known, i + size - 1]
    return None


def pos_text(slst):
    """Replace known words in every sentence with their POS table entry.

    Tokens that are already lists (entries put there by ner_text) are copied
    through untouched. For every other token sent_has_pos_at() is consulted;
    on a match the matched tokens are replaced by the returned table entry and
    scanning resumes after the last matched token. Unknown tokens are copied
    unchanged.

    Args:
        slst: list of sentences, each a list of tokens and/or table entries.

    Returns:
        A new list of sentences of the same length as ``slst``.
    """
    tagged_sentences = []
    for words in slst:
        tagged = []
        pos = 0
        while pos < len(words):
            token = words[pos]
            # Already-tagged tokens never go through the POS lookup.
            hit = None if type(token) is list else sent_has_pos_at(words, pos)
            if hit:
                # hit is [table entry, index of the last token of the match].
                tagged.append(hit[0])
                pos = hit[1] + 1
            else:
                tagged.append(token)
                pos += 1
        tagged_sentences.append(tagged)
    return tagged_sentences

def sent_has_pos_at(sent,i,table=None):
    """Look for a known word or phrase starting at position ``i`` of ``sent``.

    Table entries are tried in order and the first match wins. A sentence
    token matches a table token when it is equal to it either as written or
    after lower-casing. Tokens that are not strings (entries already inserted
    by the NER pass) never match; previously a multi-word phrase that looked
    ahead onto such a token crashed with AttributeError on ``.lower()``.
    Entries with an empty token list are skipped, since they would "match"
    with an end index of ``i - 1`` and stall the caller's scan loop.

    Args:
        sent: the sentence as a list of tokens and/or table entries.
        i: index in ``sent`` at which the phrase has to start.
        table: optional lookup table with the same layout as ``postable``;
            the module-level ``postable`` is used when omitted.

    Returns:
        ``[entry, last_index]`` on a match, where ``last_index`` is the index
        of the last matched token; ``0`` when ``sent`` is empty or ``i`` is
        past its end; ``None`` when no entry matches. Every failure value is
        falsy, so callers can simply test the result for truth.
    """
    if not sent:
        return 0
    if i >= len(sent):
        return 0
    if table is None:
        table = postable
    for known in table:
        phrase = known[0]
        size = len(phrase)
        # Skip empty phrases and phrases that would run past the sentence end.
        if size == 0 or i + size > len(sent):
            continue
        if all(
            isinstance(sent[i + j], str)
            and (sent[i + j] == phrase[j] or sent[i + j].lower() == phrase[j])
            for j in range(size)
        ):
            return [known, i + size - 1]
    return None

def split_text(txt):
    """Split ``txt`` into sentences and each sentence into words.

    Commas are treated as word separators, every "." ends a sentence, and
    words are separated by any run of whitespace (spaces, tabs, newlines).
    Fragments that contain no words -- for example the whitespace after the
    final full stop -- are dropped instead of being returned as empty
    sentences.

    The split is deliberately naive: a "." inside an abbreviation or a
    decimal number also ends a sentence.

    Args:
        txt: the input text.

    Returns:
        A list of sentences, each a non-empty list of word strings.
    """
    sentences = []
    for fragment in txt.replace(",", " ").split("."):
        # str.split() without arguments splits on any whitespace and never
        # yields empty strings.
        words = fragment.split()
        if words:
            sentences.append(words)
    return sentences

def pretty_print(sentlst):
    """Print every item of ``sentlst`` under its own "sentence: " heading.

    For items that are lists, each element is printed on its own line,
    indented by two spaces. Items of any other type get only the heading.

    Args:
        sentlst: iterable of sentences (or of triples, as main() also uses it).

    Returns:
        None.
    """
    for item in sentlst:
        print("sentence: ")
        if type(item) is not list:
            continue
        for part in item:
            print("  " + str(part))

def simple_rdf(sentlst):
    """Collect the triples of all sentences into one flat list.

    Every sentence is passed to simple_rdf_sentence() together with the
    sentence that preceded it (None for the first one). Sentences that yield
    nothing are skipped.

    Args:
        sentlst: list of tagged sentences.

    Returns:
        A list of triples, in sentence order.
    """
    triples = []
    previous = None
    for current in sentlst:
        produced = simple_rdf_sentence(current, previous)
        previous = current
        if produced:
            triples.extend(produced)
    return triples

def simple_rdf_sentence(sent,prevsent):
    """Turn one tagged sentence into a list of RDF-like triples.

    The tagged phrases of ``sent`` are sorted into verbs, adverbs, nouns and
    adjectives. A pronoun is replaced by a noun of the previous sentence.
    Triples are produced only when there is at least one verb and at least two
    nouns; the first two nouns act as subject and object, the first verb as
    predicate:

    * with an adverb:    [subject, "id:action", L], [L, "id:isactivity", verb],
                         [L, "id:extrainfo", adverb]
    * with an adjective: [subject, verb, L], [L, "id:isobject", object],
                         [L, "id:extrainfo", adjective]
    * otherwise:         [subject, verb, object], plus one
                         [noun, "rdf:type", type] triple for each of the two
                         nouns that carries a non-empty fifth field.

    ``L`` is a fresh id from create_local_id(). All identifiers are taken from
    index 3 of the table entries.

    Args:
        sent: tagged sentence; table entries are lists, untagged words are
            plain strings and are ignored.
        prevsent: the preceding tagged sentence, or None for the first one.

    Returns:
        A list of three-element lists, or None when the sentence has no verb
        or fewer than two nouns.

    Side effects:
        Prints the collected word groups as debug output whenever triples are
        produced.
    """
    verbs = []
    adverbs = []
    nouns = []
    adjectives = []
    for phrase in sent:
        # Only table entries (lists with at least a word-class field) count.
        if not isinstance(phrase, list) or len(phrase) < 3:
            continue
        kind = phrase[2]
        if kind == "verb":
            verbs.append(phrase)
        elif kind == "adverb":
            adverbs.append(phrase)
        elif kind in ("ner_noun", "noun"):
            nouns.append(phrase)
        elif kind == "pronoun":
            # Without a previous sentence there is nothing to refer back to.
            candidates = get_candidate_nouns(prevsent) if prevsent else []
            if candidates:
                # NOTE: when several nouns qualify, the first noun of the
                # previous sentence is chosen as a simple heuristic. Storing
                # the whole candidate list (as the draft did) made the later
                # nouns[k][3] lookups fail.
                nouns.append(candidates[0])
        elif kind == "adjective":
            adjectives.append(phrase)

    if not verbs or len(nouns) < 2:
        return None

    print("nouns: "+str(nouns))
    print("verbs: "+str(verbs))
    print("adverbs: "+str(adverbs))
    print("adjectives: "+str(adjectives))

    subject = nouns[0]
    target = nouns[1]
    verb = verbs[0]

    if adverbs:
        lid = create_local_id()
        return [[subject[3], "id:action", lid],
                [lid, "id:isactivity", verb[3]],
                [lid, "id:extrainfo", adverbs[0][3]]]
    if adjectives:
        lid = create_local_id()
        return [[subject[3], verb[3], lid],
                [lid, "id:isobject", target[3]],
                [lid, "id:extrainfo", adjectives[0][3]]]

    print(str(["nouns", subject, "verbs", verb, "nouns", target]))
    print("n:"+str(subject))
    print("v:"+str(verb))
    print("n:"+str(target))
    rdf = [[subject[3], verb[3], target[3]]]
    for noun in (subject, target):
        # The type lives at index 4, so the entry needs at least five fields.
        if len(noun) > 4 and noun[4]:
            rdf.append([noun[3], "rdf:type", noun[4]])
    return rdf

def get_candidate_nouns(sent):
    """Return the noun entries of a tagged sentence, in sentence order.

    Only table entries (lists) whose word-class field at index 2 is
    "ner_noun" or "noun" qualify. Untagged words (plain strings) and entries
    too short to have a word-class field are ignored; indexing them used to
    raise IndexError for words shorter than three characters.

    Args:
        sent: tagged sentence, or None when there is no sentence to inspect.

    Returns:
        A list of the qualifying entries; empty when ``sent`` is None or
        contains no nouns.
    """
    if not sent:
        return []
    return [
        phrase
        for phrase in sent
        if isinstance(phrase, list)
        and len(phrase) > 2
        and phrase[2] in ("ner_noun", "noun")
    ]

def create_local_id():
    """Issue the next local identifier.

    Advances the module-level counter ``idnum`` by one and returns the new
    value prefixed with "id:local_".
    """
    global idnum
    idnum = idnum + 1
    return "id:local_{}".format(idnum)
 

# Run the demo pipeline on the sample text. This is a plain top-level call, so
# it executes whenever the file is run or imported.
main(intxt)