# -*- coding: utf-8 -*-
"""Count lexical co-occurrences in a tab-separated dependency treebank.

Each token line of the input is split on tabs; the fields at index 2, 3, 4,
6 and 7 are read as lemma, category, part of speech, governor position
(1-based, as an integer) and function label.  Sentences are separated by
blank lines or by lines starting with ``#``.

The user picks one category of relation at the prompt; the matching
``find_*`` function is applied to every candidate token, and the resulting
counts are written to ``output_path`` as ``lemma#lemma[#lemma]##count``
lines.

The code runs unchanged on Python 2.6+ and Python 3.
"""
import codecs
import io
import os
import re

input_path = '/Users/perrier/recherche/deep-sequoia/trunk/sequoia_surf.conll'
output_path = '/Users/perrier/recherche/GREW_resources/parsing/experiment/lexicon.lp'

PROMPT = "adj, appnoun, coord, cpl, prepverb, prepnoun, verbadv ? "

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


def _governor(s, entry):
    """Return the entry of sentence *s* that governs *entry*, or None.

    ``entry["gov"]`` is a 1-based position in *s*.  Any value outside
    ``1..len(s)`` (0 is presumably the root marker) yields None rather than
    wrapping around to the end of the sentence through negative indexing.
    """
    gov = entry["gov"]
    if 1 <= gov <= len(s):
        return s[gov - 1]
    return None


def _count(lex, key):
    """Increment the occurrence counter of *key* in *lex* (in place)."""
    lex[key] = lex.get(key, 0) + 1


def find_adj_dep(i, s, lex):
    """Count (governor lemma, lemma of token *i*) for a token governed from the left.

    The pair is counted only when the governor precedes token *i*, the
    token's function is not ``dep.coord`` and the governor's category is one
    of A, N, PRO, V.

    Args:
        i: 0-based index of the dependent token in *s*.
        s: the sentence, a list of token dicts.
        lex: dict of counts, updated in place.

    Returns:
        *lex*.
    """
    e = s[i]
    # "gov" is 1-based and i is 0-based, so gov <= i means "strictly before".
    if e["gov"] <= i and e["funct"] != u"dep.coord":
        gov_entry = _governor(s, e)
        if gov_entry is not None and gov_entry["cat"] in (u"A", u"N", u"PRO", u"V"):
            _count(lex, (gov_entry["lemma"], e["lemma"]))
    return lex


def find_app_noun(i, s, lex):
    """Count (governor lemma, lemma of token *i*) for a ``mod.app`` dependent.

    The pair is counted only when the governor precedes token *i* and the
    token's function is ``mod.app``.

    Args:
        i: 0-based index of the dependent token in *s*.
        s: the sentence, a list of token dicts.
        lex: dict of counts, updated in place.

    Returns:
        *lex*.
    """
    e = s[i]
    if e["gov"] <= i and e["funct"] == u"mod.app":
        gov_entry = _governor(s, e)
        if gov_entry is not None:
            _count(lex, (gov_entry["lemma"], e["lemma"]))
    return lex


def find_coord(i, s, lex):
    """Count (first conjunct lemma, lemma of token *i*) across a coordinator.

    Token *i* must be governed by a token whose function is ``coord`` and
    whose lemma is neither ``ni`` nor ``soit``; the first conjunct is the
    governor of that coordinator.

    Args:
        i: 0-based index of the second conjunct in *s*.
        s: the sentence, a list of token dicts.
        lex: dict of counts, updated in place.

    Returns:
        *lex*.
    """
    e = s[i]
    coord_entry = _governor(s, e)
    if coord_entry is None or coord_entry["funct"] != u"coord":
        return lex
    if coord_entry["lemma"] in (u"ni", u"soit"):
        return lex
    conj1_entry = _governor(s, coord_entry)
    if conj1_entry is not None:
        _count(lex, (conj1_entry["lemma"], e["lemma"]))
    return lex


def find_cpl_dep(i, s, lex):
    """Count (governor lemma, complementizer lemma, lemma of token *i*).

    Token *i* must be governed by a token whose part of speech is ``CS`` and
    whose function is not ``dep.coord``; the first element of the triple is
    the lemma of that token's own governor.

    Args:
        i: 0-based index of the dependent token in *s*.
        s: the sentence, a list of token dicts.
        lex: dict of counts, updated in place.

    Returns:
        *lex*.
    """
    e = s[i]
    cpl_entry = _governor(s, e)
    if cpl_entry is None or cpl_entry["pos"] != u"CS":
        return lex
    if cpl_entry["funct"] == u"dep.coord":
        return lex
    gov_entry = _governor(s, cpl_entry)
    if gov_entry is not None:
        _count(lex, (gov_entry["lemma"], cpl_entry["lemma"], e["lemma"]))
    return lex


def find_prep_dep(i, s, lex):
    """Count (governor lemma, preposition lemma, lemma of token *i*).

    Token *i* must be governed by a token of category ``P`` or ``P+D`` whose
    own governor has a category among A, ADV, N, PRO, V.

    Args:
        i: 0-based index of the dependent token in *s*.
        s: the sentence, a list of token dicts.
        lex: dict of counts, updated in place.

    Returns:
        *lex*.
    """
    e = s[i]
    prep_entry = _governor(s, e)
    if prep_entry is None or prep_entry["cat"] not in (u"P", u"P+D"):
        return lex
    gov_entry = _governor(s, prep_entry)
    if gov_entry is not None and gov_entry["cat"] in (u"A", u"ADV", u"N", u"PRO", u"V"):
        _count(lex, (gov_entry["lemma"], prep_entry["lemma"], e["lemma"]))
    return lex


def find_adv_dep(i, s, lex):
    """Count (governor lemma, lemma of token *i*) for a token governed from the left.

    The pair is counted only when the governor precedes token *i* and the
    token's function is not ``dep.coord``.  No category filter is applied to
    the governor.

    Args:
        i: 0-based index of the dependent token in *s*.
        s: the sentence, a list of token dicts.
        lex: dict of counts, updated in place.

    Returns:
        *lex*.
    """
    e = s[i]
    if e["gov"] <= i and e["funct"] != u"dep.coord":
        gov_entry = _governor(s, e)
        if gov_entry is not None:
            _count(lex, (gov_entry["lemma"], e["lemma"]))
    return lex


# Category name -> (predicate selecting candidate tokens, counting function).
_EXTRACTORS = {
    "adj": (lambda e: e["cat"] == u"A" or e["pos"] in (u"VPP", u"VPR"),
            find_adj_dep),
    "appnoun": (lambda e: e["cat"] in (u"N", u"PRO"), find_app_noun),
    "coord": (lambda e: e["funct"] == u"dep.coord", find_coord),
    "cpl": (lambda e: e["cat"] == u"V", find_cpl_dep),
    "prepverb": (lambda e: e["pos"] in (u"VINF", u"VPR"), find_prep_dep),
    "prepnoun": (lambda e: e["cat"] in (u"N", u"A"), find_prep_dep),
    "adv": (lambda e: e["cat"] == u"ADV", find_adv_dep),
}

# The prompt advertises "verbadv" while the historical keyword is "adv";
# accept both so that neither spelling silently produces an empty lexicon.
_ALIASES = {"verbadv": "adv"}


def _extractor_for(cat):
    """Return the (predicate, finder) pair for category *cat*.

    Raises:
        ValueError: if *cat* is not a known category or alias.
    """
    name = _ALIASES.get(cat, cat)
    if name not in _EXTRACTORS:
        known = ", ".join(sorted(list(_EXTRACTORS) + list(_ALIASES)))
        raise ValueError("unknown category %r (expected one of: %s)" % (cat, known))
    return _EXTRACTORS[name]


def read_sentences(lines):
    """Yield the sentences found in *lines* as lists of token dicts.

    A blank (or whitespace-only) line, or a line starting with ``#``, ends
    the current sentence.  The last sentence is yielded even when the input
    does not end with such a separator.  Each token dict has the keys
    ``lemma``, ``cat``, ``pos``, ``gov`` (int) and ``funct``.

    Raises:
        IndexError: if a token line has fewer than 8 tab-separated fields.
        ValueError: if the governor field is not an integer.
    """
    sent = []
    for line in lines:
        # Drop the line terminator first so it can never end up in a field.
        row = line.rstrip(u"\r\n")
        if not row.strip() or row.startswith(u"#"):
            if sent:
                yield sent
                sent = []
            continue
        fields = row.split(u"\t")
        sent.append({
            "lemma": fields[2],
            "cat": fields[3],
            "pos": fields[4],
            "gov": int(fields[6]),
            "funct": fields[7],
        })
    if sent:
        yield sent


def build_lexicon(lines, cat):
    """Return the co-occurrence counts of category *cat* found in *lines*.

    Args:
        lines: iterable of treebank lines (see ``read_sentences``).
        cat: one of adj, appnoun, coord, cpl, prepverb, prepnoun, adv
            (``verbadv`` is accepted as a synonym of ``adv``).

    Returns:
        A dict mapping tuples of lemmas to their number of occurrences.

    Raises:
        ValueError: if *cat* is not a known category.
    """
    selects, finder = _extractor_for(cat)
    lexicon = {}
    for sent in read_sentences(lines):
        for i, entry in enumerate(sent):
            if selects(entry):
                finder(i, sent, lexicon)
    return lexicon


def write_lexicon(lexicon, path):
    """Write *lexicon* to *path* (UTF-8), one ``a#b[#c]##count`` line per key."""
    # newline="\n" keeps the output byte-identical on every platform.
    with io.open(path, mode="w", encoding="utf-8", newline="\n") as out:
        for key, count in lexicon.items():
            out.write(u"%s##%d\n" % (u"#".join(key), count))


def main():
    """Read ``input_path``, ask for a category and write ``output_path``."""
    with io.open(input_path, mode="r", encoding="utf-8") as finput:
        corpus = finput.readlines()
    cat = _read_line(PROMPT).strip()
    try:
        _extractor_for(cat)
    except ValueError as err:
        # Fail before the output file is created or truncated.
        raise SystemExit(str(err))
    write_lexicon(build_lexicon(corpus, cat), output_path)


if __name__ == "__main__":
    main()