#!/bin/env python3
#-*- coding: utf8 -*-

"""
This file contains funtions to tag a sentence with the TreeTagger
and to convert a tagged sentence to a format for our application.
"""

#PYTHON PACKAGES IMPORTS
import re

#LOCAL IMPORT
import sys
import os
from src import treetagger as tt

THIS_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.dirname(THIS_DIR))

#TreeTagger french tags
UNTREATED_TAGS = ['ABR', 'ADJ', 'ADV', 'DET:ART', 'INT', 'KON', 'PUN', 'NOM',
                  'NUM', 'PRO', 'PRO:DEM', 'PRO:IND', 'PRO:REL', 'PRP',
                  'PRP:det', 'PUN', 'PUN:cit', 'SENT', 'SYM', 'VER:infi',
                  'VER:pper', 'VER:ppre']
ENTITY_NAME = ['NAM']
DET_POS = ['DET:POS']
PRONOM_PERS = ['PRO:PER']
PRONOM_POSSESSIF = ['PRO:POS']
VERB = ['VER:cond', 'VER:futu', 'VER:impe', 'VER:impf', 'VER:pres',
        'VER:simp', 'VER:subi', 'VER:subp']
END_SENT = ['END']


def split(string, num):
    """
    A function that split a string in a list of n characters string

    Arguments:
        string {str} -- the text we want to cut
        num {int} -- the length of the string at the output

    Returns:
        [list] -- Returns a list of string
    """

    new_list = []
    for i in range(0, len(string), num):
        new_list.append(string[i:i+num])
    return new_list

def tagging_text(text, subject, article_file, path="./TreeTagger", lang="french"):
    """
    A function that call that transform a text in a tagged text by TreeTagger

    Arguments:
        text {string} -- the text we want to tag
        subject {type} -- the title of the original wikipedia page
        article_file {file} -- the report file

    Keyword Arguments:
        path {str} -- the path to the TreeTagger (optionnal) (default: {"./TreeTagger"})
        lang {str} -- the language of the text (optionnal) (default: {"french"})

    Returns:
        [list] -- Returns the tagged text
    """
    text = cleaning_sentence(text, subject)
    article_file.write("----- Cleaning sentence : -----\n\n"+text+"\n\n")
    tttag = tt.TreeTagger(path_to_treetagger=path, language=lang)
    tag = tttag.tag(text)
    article_file.write("----- Tagging sentence : -----\n\n")
    tagged_txt = ''.join('('+str(part[0])+','+str(part[1])+','+str(part[2])+')'
                         for part in tag)
    #Divise le string en string de 2096 char
    #pour pas avoir de problèmes de longueur de ligne avec le fichier
    for part in split(tagged_txt, 2096):
        article_file.write(str(part)+'\n')
    article_file.write('\n')
    article_file.write("----- Transform sentences : -----\n\n")
    return tag

#def separe_sentences(text, subject, article_file, path="./TreeTagger", lang="french"):
def separe_sentences(text, subject, article_file):
    """
    A function that call tagging_text() function and split the tagged sentences

    Arguments:
        text {string} -- the text we want to tag
        subject {string} -- the title of the original wikipedia page
        article_file {file} -- the report file

    Keyword Arguments:
        path {str} -- the path to the TreeTagger (optionnal) (default: {"./TreeTagger"})
        lang {str} -- the language of the text (optionnal) (default: {"french"})

    Returns:
        [list] -- Returns the list of sentences
    """
    sentence_list = []
    current_sent = []
    for i in tagging_text(text, subject, article_file):
        if i[1] != 'SENT':
            current_sent.append(i)
        else:
            current_sent.append(i)
            sentence_list.append(current_sent)
            current_sent = []
    return sentence_list

def add_mod_stce(temp_saved_tag, current_group, mod_sentence):
    """A function to shorten transform_sentence"""
    if temp_saved_tag != '':
        mod_sentence.append((temp_saved_tag, current_group))

def transform_sentence(sentence, article_file):
    """
    A function that transform tagged text in an other tagging more simple

    Arguments:
        sentence {list} -- the tagged text that we will transform
        file {file} -- the report file

    Returns:
        [list] -- Returns the new tagged text
    """
    sentence.append(['END', 'END', 'END'])
    mod_sentence = []
    temp_saved_tag = ''
    last_word = ''
    current_group = ''
    for word in sentence:
        move_word = False
        if word[1] in UNTREATED_TAGS:
            if last_word not in UNTREATED_TAGS:
                add_mod_stce(temp_saved_tag, current_group, mod_sentence)
                current_group, temp_saved_tag = '', 'A'
            move_word = True
        elif word[1] in ENTITY_NAME:
            if last_word not in ENTITY_NAME:
                add_mod_stce(temp_saved_tag, current_group, mod_sentence)
                current_group, temp_saved_tag = '', 'S'
            move_word = True
        elif word[1] in DET_POS:
            if last_word not in DET_POS:
                add_mod_stce(temp_saved_tag, current_group, mod_sentence)
                current_group, temp_saved_tag = '', 'D:POS'
            move_word = True
        elif word[1] in PRONOM_PERS:
            if last_word not in PRONOM_PERS:
                add_mod_stce(temp_saved_tag, current_group, mod_sentence)
                current_group, temp_saved_tag = '', 'P:PERS'
            move_word = True
        elif word[1] in PRONOM_POSSESSIF:
            if last_word not in PRONOM_POSSESSIF:
                add_mod_stce(temp_saved_tag, current_group, mod_sentence)
                current_group, temp_saved_tag = '', 'P:POS'
            move_word = True
        elif word[1] in VERB:
            if last_word not in VERB:
                add_mod_stce(temp_saved_tag, current_group, mod_sentence)
                current_group, temp_saved_tag = '', 'V'
            move_word = True
        elif word[1] in END_SENT:
            if last_word not in END_SENT:
                add_mod_stce(temp_saved_tag, current_group, mod_sentence)
        else:
            move_word = True
        if move_word:
            last_word = word[1]
            current_group += word[0] + " "

    check_subj(mod_sentence, article_file.name[:article_file.name.find('.')])

    transform_sent = ''.join('(\"'+str(part[0])+'\",\"'+str(part[1])+'\"),'
                             for part in mod_sentence)
    #Divise le string en string de 2096 char
    #pour pas avoir de problèmes de longueur de ligne avec le fichier
    for part in split(transform_sent, 2096):
        article_file.write(str(part)+'\n')
    return clear_sentence(mod_sentence)

def check_entities(sentence, entity):
    """
    A function that transform tagged group where tag is 'S'
    in tagged group where tag is 'A' where they don't contain the subject entity

    Arguments:
        sentence {list} -- A sentence of new tagged groups
        entity {string} -- the name of the original wikipedia page

    Returns:
        [list] -- Returns the modified sentence
    """
    for idx, group in enumerate(sentence):
        if group[0] == 'S' and not is_subj(group, entity):
            sentence[idx] = ('A', group[1])
    return sentence

def check_subj(sentence, subject):
    """
    A function that modifies a tagged sentence to tag a non-subject
    as subject if it contains the subject.

    Arguments:
        sentence {list} -- sentence as a list of tagged groups
        subject {str} -- subject to check
    """
    for idx, part in enumerate(sentence):
        if part[0] != 'S' and is_subj(part, subject):
            sentence[idx] = ('S', subject)

def is_subj(part, subject):
    """
    Returns whether the part checked contains any part of the subject.
    """
    subject = subject.split()
    for subj in subject:
        if subj in part[1]:
            return True
    return False

def clear_sentence(sentence):
    """
    A function that delete useless spaces

    Arguments:
        sentence {list} -- A sentence to be cleaned

    Returns:
        [list] -- Returns the cleaned sentence
    """

    return [(word[0], word[1].rstrip()) for word in sentence]

def del_phonetic(sentence):
    """
    A function that delete wikipedia phonetics informations

    Arguments:
        sentence {list} -- A sentence to be cleaned

    Returns:
        [list] -- Returns the cleaned sentence
    """

    return re.sub(r'\[.*?\]', '', sentence)

def del_parenthesis_content(sentence):
    """
    A function that delete parenthesis content

    Arguments:
        sentence {list} -- A sentence to be cleaned

    Returns:
        [list] -- Returns the cleaned sentence
    """
    return re.sub(r'\(.*?\)', '', sentence)

def clear_multiple_spaces(sentence):
    """
    A function that transform multiple spaces in one space

    Arguments:
        sentence {list} -- A sentence to be cleaned

    Returns:
        [list] -- Returns the cleaned sentence
    """
    return re.sub(r'\s+', ' ', sentence)

def clean_born(sentence):
    """
    A function that catch born sentence french format of wikipedia and
    return a modified sentence

    Arguments:
        sentence {list} -- A sentence to be modified

    Returns:
        [list] -- Returns the modified sentence
    """
    sentence = re.sub(r'(,)\s(né )', ' est né ', sentence)
    sentence = re.sub(r'(,)\s(née )', ' est née ', sentence)
    sentence = re.sub(r'(,)\s(nés )', ' sont nés ', sentence)
    sentence = re.sub(r'(,)\s(nées )', ' sont nées ', sentence)
    return sentence

def clean_say(sentence, subject):
    """
    A function that catch says sentence french format of wikipedia and
    return a modified sentence

    Arguments:
        sentence {list} -- A sentence to be modified
        subject {string} -- the subject of the original wikipedia page

    Returns:
        [list] -- Returns the modified sentence
    """
    sentence = re.sub(r'^(.)*(,)\s(dit)\s'+subject+'(,)', subject, sentence)
    return sentence

def clean_or_simply(sentence):
    """
    A function that catch "ou simplement" sentence french format of wikipedia and
    return a modified sentence

    Arguments:
        sentence {list} -- A sentence to be modified

    Returns:
        [list] -- [Returns the modified sentence]
    """
    return re.sub(r'\,\s(ou simplement)([a-zA-Z]|\s)*\,', ',', sentence)

def clean_alias(sentence, subject):
    """
    A function that catch alias sentence french format of wikipedia and
    return a modified sentence

    Arguments:
        sentence {list} -- A sentence to be modified

    Returns:
        [list] -- [Returns the modified sentence]
    """
    sentence = re.sub(r'(([a-zA-Z]|\s)*)\,\s(alias)\s('+subject+r')\,',
                      r'Il est connu sous le nom de \1,', sentence)
    return sentence

def clean_isolate_etre_verb(sentence, subject):
    """
    A function that catch isolated verbs and add before them the subject

    Arguments:
        sentence {list} -- A sentence to be modified
        subject {string} -- the subject of the original wikipedia page

    Returns:
        [list] -- Returns the modified sentence
    """
    sentence = re.sub(r'(,)\s(est)', ', ' + subject + ' est', sentence)
    sentence = re.sub(r'(,)\s(sont)', ', ' + subject + ' sont', sentence)
    return sentence

def clean_poss(sentence, subject):
    """
    A function that catch possessive group

    Arguments:
        sentence {list} -- A sentence to be modified
        subject {string} -- the subject of the original wikipedia page

    Returns:
        [list] -- Returns the modified sentence
    """
    sentence = re.sub(r''+subject+r'\s(et ses)\s(.*)',
                      r'moi et ses \2', sentence)
    sentence = re.sub(r''+subject+r'\s(et leurs)\s(.*)',
                      r'moi et leurs \2', sentence)
    return sentence

def cleaning_sentence(sentence, subject):
    """
    A function that call cleaning functions

    Arguments:
        sentence {list} -- A sentence to be modified
        subject {string} -- the subject of the original wikipedia page

    Returns:
        [list] -- Returns the modified sentence
    """
    #del useless content
    sentence = del_phonetic(sentence)
    sentence = del_parenthesis_content(sentence)

    #Clear and format content
    sentence = clean_alias(sentence, subject)
    sentence = clean_or_simply(sentence)
    sentence = clean_born(sentence)
    sentence = clean_isolate_etre_verb(sentence, subject)
    sentence = clean_say(sentence, subject)
    sentence = clean_poss(sentence, subject)

    sentence = clear_multiple_spaces(sentence)
    return sentence

def __test():
    """
    test function of tagging.py
    """
#    print(transform_sentence(tagging_text(
#        "./TreeTagger",
#        "french",
#        """Harry Potter est une série littéraire de fantasy écrite par \
#l'auteure britannique J. K. Rowling, dont la suite romanesque s'est \
#achevée en 2007.""")))
#    print(transform_sentence(tagging_text(cleaning_sentence(
#        "C'était le leur    ", "Bob"), "Bob")))
#    print(separe_sentences("Harry a tué Voldemort. Ron aime Hermione."))
#    print(cleaning_sentence(
#        """Jacques Chirac [ ʒɑk ʃiʁak] , né le 29 novembre 1932 à Paris (Ve),
#est un haut fonctionnaire et homme d'État français.""",
#        "Jacques Chirac"))
#    print(clean_say("""Jean-Baptiste Poquelin, dit Molière, est un comédien \
#et dramaturge français, baptisé le 15 janvier 1622 à Paris, où il est mort \
#le 17 février 1673. """, "Molière"))

if __name__ == '__main__':
    __test()