#!/bin/env python3 #-*- coding: utf8 -*- """ This file contains funtions to tag a sentence with the TreeTagger and to convert a tagged sentence to a format for our application. """ #PYTHON PACKAGES IMPORTS import re #LOCAL IMPORT import sys import os from src import treetagger as tt THIS_DIR = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.dirname(THIS_DIR)) #TreeTagger french tags UNTREATED_TAGS = ['ABR', 'ADJ', 'ADV', 'DET:ART', 'INT', 'KON', 'PUN', 'NOM', 'NUM', 'PRO', 'PRO:DEM', 'PRO:IND', 'PRO:REL', 'PRP', 'PRP:det', 'PUN', 'PUN:cit', 'SENT', 'SYM', 'VER:infi', 'VER:pper', 'VER:ppre'] ENTITY_NAME = ['NAM'] DET_POS = ['DET:POS'] PRONOM_PERS = ['PRO:PER'] PRONOM_POSSESSIF = ['PRO:POS'] VERB = ['VER:cond', 'VER:futu', 'VER:impe', 'VER:impf', 'VER:pres', 'VER:simp', 'VER:subi', 'VER:subp'] END_SENT = ['END'] def split(string, num): """ A function that split a string in a list of n characters string Arguments: string {str} -- the text we want to cut num {int} -- the length of the string at the output Returns: [list] -- Returns a list of string """ new_list = [] for i in range(0, len(string), num): new_list.append(string[i:i+num]) return new_list def tagging_text(text, subject, article_file, path="./TreeTagger", lang="french"): """ A function that call that transform a text in a tagged text by TreeTagger Arguments: text {string} -- the text we want to tag subject {type} -- the title of the original wikipedia page article_file {file} -- the report file Keyword Arguments: path {str} -- the path to the TreeTagger (optionnal) (default: {"./TreeTagger"}) lang {str} -- the language of the text (optionnal) (default: {"french"}) Returns: [list] -- Returns the tagged text """ text = cleaning_sentence(text, subject) article_file.write("----- Cleaning sentence : -----\n\n"+text+"\n\n") tttag = tt.TreeTagger(path_to_treetagger=path, language=lang) tag = tttag.tag(text) article_file.write("----- Tagging sentence : -----\n\n") tagged_txt = ''.join('('+str(part[0])+','+str(part[1])+','+str(part[2])+')' for part in tag) #Divise le string en string de 2096 char #pour pas avoir de problèmes de longueur de ligne avec le fichier for part in split(tagged_txt, 2096): article_file.write(str(part)+'\n') article_file.write('\n') article_file.write("----- Transform sentences : -----\n\n") return tag #def separe_sentences(text, subject, article_file, path="./TreeTagger", lang="french"): def separe_sentences(text, subject, article_file): """ A function that call tagging_text() function and split the tagged sentences Arguments: text {string} -- the text we want to tag subject {string} -- the title of the original wikipedia page article_file {file} -- the report file Keyword Arguments: path {str} -- the path to the TreeTagger (optionnal) (default: {"./TreeTagger"}) lang {str} -- the language of the text (optionnal) (default: {"french"}) Returns: [list] -- Returns the list of sentences """ sentence_list = [] current_sent = [] for i in tagging_text(text, subject, article_file): if i[1] != 'SENT': current_sent.append(i) else: current_sent.append(i) sentence_list.append(current_sent) current_sent = [] return sentence_list def add_mod_stce(temp_saved_tag, current_group, mod_sentence): """A function to shorten transform_sentence""" if temp_saved_tag != '': mod_sentence.append((temp_saved_tag, current_group)) def transform_sentence(sentence, article_file): """ A function that transform tagged text in an other tagging more simple Arguments: sentence {list} -- the tagged text that we will transform file {file} -- the report file Returns: [list] -- Returns the new tagged text """ sentence.append(['END', 'END', 'END']) mod_sentence = [] temp_saved_tag = '' last_word = '' current_group = '' for word in sentence: move_word = False if word[1] in UNTREATED_TAGS: if last_word not in UNTREATED_TAGS: add_mod_stce(temp_saved_tag, current_group, mod_sentence) current_group, temp_saved_tag = '', 'A' move_word = True elif word[1] in ENTITY_NAME: if last_word not in ENTITY_NAME: add_mod_stce(temp_saved_tag, current_group, mod_sentence) current_group, temp_saved_tag = '', 'S' move_word = True elif word[1] in DET_POS: if last_word not in DET_POS: add_mod_stce(temp_saved_tag, current_group, mod_sentence) current_group, temp_saved_tag = '', 'D:POS' move_word = True elif word[1] in PRONOM_PERS: if last_word not in PRONOM_PERS: add_mod_stce(temp_saved_tag, current_group, mod_sentence) current_group, temp_saved_tag = '', 'P:PERS' move_word = True elif word[1] in PRONOM_POSSESSIF: if last_word not in PRONOM_POSSESSIF: add_mod_stce(temp_saved_tag, current_group, mod_sentence) current_group, temp_saved_tag = '', 'P:POS' move_word = True elif word[1] in VERB: if last_word not in VERB: add_mod_stce(temp_saved_tag, current_group, mod_sentence) current_group, temp_saved_tag = '', 'V' move_word = True elif word[1] in END_SENT: if last_word not in END_SENT: add_mod_stce(temp_saved_tag, current_group, mod_sentence) else: move_word = True if move_word: last_word = word[1] current_group += word[0] + " " check_subj(mod_sentence, article_file.name[:article_file.name.find('.')]) transform_sent = ''.join('(\"'+str(part[0])+'\",\"'+str(part[1])+'\"),' for part in mod_sentence) #Divise le string en string de 2096 char #pour pas avoir de problèmes de longueur de ligne avec le fichier for part in split(transform_sent, 2096): article_file.write(str(part)+'\n') return clear_sentence(mod_sentence) def check_entities(sentence, entity): """ A function that transform tagged group where tag is 'S' in tagged group where tag is 'A' where they don't contain the subject entity Arguments: sentence {list} -- A sentence of new tagged groups entity {string} -- the name of the original wikipedia page Returns: [list] -- Returns the modified sentence """ for idx, group in enumerate(sentence): if group[0] == 'S' and not is_subj(group, entity): sentence[idx] = ('A', group[1]) return sentence def check_subj(sentence, subject): """ A function that modifies a tagged sentence to tag a non-subject as subject if it contains the subject. Arguments: sentence {list} -- sentence as a list of tagged groups subject {str} -- subject to check """ for idx, part in enumerate(sentence): if part[0] != 'S' and is_subj(part, subject): sentence[idx] = ('S', subject) def is_subj(part, subject): """ Returns whether the part checked contains any part of the subject. """ subject = subject.split() for subj in subject: if subj in part[1]: return True return False def clear_sentence(sentence): """ A function that delete useless spaces Arguments: sentence {list} -- A sentence to be cleaned Returns: [list] -- Returns the cleaned sentence """ return [(word[0], word[1].rstrip()) for word in sentence] def del_phonetic(sentence): """ A function that delete wikipedia phonetics informations Arguments: sentence {list} -- A sentence to be cleaned Returns: [list] -- Returns the cleaned sentence """ return re.sub(r'\[.*?\]', '', sentence) def del_parenthesis_content(sentence): """ A function that delete parenthesis content Arguments: sentence {list} -- A sentence to be cleaned Returns: [list] -- Returns the cleaned sentence """ return re.sub(r'\(.*?\)', '', sentence) def clear_multiple_spaces(sentence): """ A function that transform multiple spaces in one space Arguments: sentence {list} -- A sentence to be cleaned Returns: [list] -- Returns the cleaned sentence """ return re.sub(r'\s+', ' ', sentence) def clean_born(sentence): """ A function that catch born sentence french format of wikipedia and return a modified sentence Arguments: sentence {list} -- A sentence to be modified Returns: [list] -- Returns the modified sentence """ sentence = re.sub(r'(,)\s(né )', ' est né ', sentence) sentence = re.sub(r'(,)\s(née )', ' est née ', sentence) sentence = re.sub(r'(,)\s(nés )', ' sont nés ', sentence) sentence = re.sub(r'(,)\s(nées )', ' sont nées ', sentence) return sentence def clean_say(sentence, subject): """ A function that catch says sentence french format of wikipedia and return a modified sentence Arguments: sentence {list} -- A sentence to be modified subject {string} -- the subject of the original wikipedia page Returns: [list] -- Returns the modified sentence """ sentence = re.sub(r'^(.)*(,)\s(dit)\s'+subject+'(,)', subject, sentence) return sentence def clean_or_simply(sentence): """ A function that catch "ou simplement" sentence french format of wikipedia and return a modified sentence Arguments: sentence {list} -- A sentence to be modified Returns: [list] -- [Returns the modified sentence] """ return re.sub(r'\,\s(ou simplement)([a-zA-Z]|\s)*\,', ',', sentence) def clean_alias(sentence, subject): """ A function that catch alias sentence french format of wikipedia and return a modified sentence Arguments: sentence {list} -- A sentence to be modified Returns: [list] -- [Returns the modified sentence] """ sentence = re.sub(r'(([a-zA-Z]|\s)*)\,\s(alias)\s('+subject+r')\,', r'Il est connu sous le nom de \1,', sentence) return sentence def clean_isolate_etre_verb(sentence, subject): """ A function that catch isolated verbs and add before them the subject Arguments: sentence {list} -- A sentence to be modified subject {string} -- the subject of the original wikipedia page Returns: [list] -- Returns the modified sentence """ sentence = re.sub(r'(,)\s(est)', ', ' + subject + ' est', sentence) sentence = re.sub(r'(,)\s(sont)', ', ' + subject + ' sont', sentence) return sentence def clean_poss(sentence, subject): """ A function that catch possessive group Arguments: sentence {list} -- A sentence to be modified subject {string} -- the subject of the original wikipedia page Returns: [list] -- Returns the modified sentence """ sentence = re.sub(r''+subject+r'\s(et ses)\s(.*)', r'moi et ses \2', sentence) sentence = re.sub(r''+subject+r'\s(et leurs)\s(.*)', r'moi et leurs \2', sentence) return sentence def cleaning_sentence(sentence, subject): """ A function that call cleaning functions Arguments: sentence {list} -- A sentence to be modified subject {string} -- the subject of the original wikipedia page Returns: [list] -- Returns the modified sentence """ #del useless content sentence = del_phonetic(sentence) sentence = del_parenthesis_content(sentence) #Clear and format content sentence = clean_alias(sentence, subject) sentence = clean_or_simply(sentence) sentence = clean_born(sentence) sentence = clean_isolate_etre_verb(sentence, subject) sentence = clean_say(sentence, subject) sentence = clean_poss(sentence, subject) sentence = clear_multiple_spaces(sentence) return sentence def __test(): """ test function of tagging.py """ # print(transform_sentence(tagging_text( # "./TreeTagger", # "french", # """Harry Potter est une série littéraire de fantasy écrite par \ #l'auteure britannique J. K. Rowling, dont la suite romanesque s'est \ #achevée en 2007."""))) # print(transform_sentence(tagging_text(cleaning_sentence( # "C'était le leur ", "Bob"), "Bob"))) # print(separe_sentences("Harry a tué Voldemort. Ron aime Hermione.")) # print(cleaning_sentence( # """Jacques Chirac [ ʒɑk ʃiʁak] , né le 29 novembre 1932 à Paris (Ve), #est un haut fonctionnaire et homme d'État français.""", # "Jacques Chirac")) # print(clean_say("""Jean-Baptiste Poquelin, dit Molière, est un comédien \ #et dramaturge français, baptisé le 15 janvier 1622 à Paris, où il est mort \ #le 17 février 1673. """, "Molière")) if __name__ == '__main__': __test()