# -*- coding: utf-8 -*-
#Copyright 2021 P. Milizia CC-BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
import re
import glob


# PYTHON CLASS FOR READING A UD TREEBANK
class UD:
    """In-memory reader for a directory of CoNLL-U files.

    A token is a list of 11 items: the ten CoNLL-U columns followed by the
    id of the sentence it belongs to::

        [0] ID (int)      [1] FORM      [2] LEMMA     [3] UPOS
        [4] XPOS          [5] FEATS (list, split on '|')
        [6] HEAD (int)    [7] DEPREL    [8] DEPS
        [9] MISC (list, split on '|')   [10] sentence id (str)

    Only lines whose ID and HEAD are plain integers become tokens, so
    multiword-token ranges ("1-2") and empty nodes ("1.1") are skipped.

    Attributes:
        text_list: the ``# newdoc ...`` header lines, in sorted order.
        data: the concatenated file contents with the sentence id appended
            to every non-comment line.
        tokens: every token of the treebank, in document order.
        sentences: dict mapping a sentence id to ``['root', tok1, tok2, ...]``
            so that a token's ID (and any HEAD value) is directly its index
            in the list; index 0 is the string placeholder ``'root'``.
            Sentence ids are the dictionary keys, so they are expected to be
            present and unique across the treebank.
    """

    _NEWDOC_MARK = '# newdoc '
    # Raw strings: '\s' and '\|' are invalid escapes in ordinary literals.
    _NEWDOC_RE = re.compile(r'#\snewdoc\s.*')
    _SENT_ID_RE = re.compile(r'(?<=sent_id\s=\s).*')
    # Anchored with '^' (MULTILINE) instead of a leading '\n' so that a token
    # on the very first line is matched too; '+' instead of '*?' on ID/HEAD
    # so that an empty numeric field cannot reach int('').
    _TOKEN_RE = re.compile(
        r'^([0-9]+)\t(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*?)'
        r'\t([0-9]+)\t(.*?)\t(.*?)\t(.*?)\t(.*)',
        re.MULTILINE,
    )

    def __init__(self, dir_name: str = '.'):
        """Load every file matching ``<dir_name>/*conllu``.

        Args:
            dir_name: directory that contains the CoNLL-U files.
        """
        # Read the files (sorted so the result does not depend on the
        # arbitrary order returned by glob).
        chunks = []
        for filepath in sorted(glob.glob(dir_name + '/*conllu')):
            # CoNLL-U is UTF-8 by specification; do not rely on the locale.
            with open(filepath, 'r', encoding='utf-8') as file:
                chunks.append(file.read())
        data = ''.join(chunks)

        # Sort the documents by their '# newdoc' header. Anything that
        # precedes the first marker is dropped. When the treebank has no
        # marker at all the data is kept untouched instead of being thrown
        # away.
        if self._NEWDOC_MARK in data:
            docs = data.split(self._NEWDOC_MARK)[1:]
            docs.sort()
            data = self._NEWDOC_MARK + self._NEWDOC_MARK.join(docs)
        self.text_list = self._NEWDOC_RE.findall(data)

        # Build the core attributes.
        self.data = self.add_sent_id(data)
        self.tokens = self.create_tokens(self.data)
        self.sentences = self.create_sentences(self.tokens)

    def add_sent_id(self, text):
        """Append the current sentence id as an extra tab-separated column.

        The id is taken from the most recent ``sent_id = ...`` comment line
        (empty string until one is seen). Comment lines and empty lines are
        left unchanged. Every input line is written back followed by a
        newline.
        """
        sid = ''
        out = []
        for line in text.split('\n'):
            if line.startswith('#'):
                # Only comment lines can carry a sent_id declaration.
                found = self._SENT_ID_RE.search(line)
                if found is not None:
                    sid = found.group()
            elif line != '':
                line = line + '\t' + sid
            out.append(line + '\n')
        return ''.join(out)

    def create_tokens(self, data):
        """Parse *data* (as produced by ``add_sent_id``) into a token list."""
        tokens = []
        for fields in self._TOKEN_RE.findall(data):
            token = list(fields)
            token[0] = int(token[0])          # ID
            token[5] = token[5].split('|')    # FEATS
            token[6] = int(token[6])          # HEAD
            token[9] = token[9].split('|')    # MISC
            tokens.append(token)
        return tokens

    def create_sentences(self, tokens):
        """Group consecutive tokens that share a sentence id.

        Returns a dict ``{sentence_id: ['root', tok1, tok2, ...]}``.
        """
        sentences = {}
        # None (never a valid id) rather than '' so that tokens whose
        # sentence id is the empty string still open a sentence.
        sid = None
        for t in tokens:
            if t[10] != sid:
                sid = t[10]
                sentences[sid] = ['root']
            sentences[sid].append(t)
        return sentences

    # Query functions

    def _token_at(self, sid, index):
        """Return the token at 1-based *index* of sentence *sid*, or None."""
        sentence = self.sentences[sid]
        if 1 <= index < len(sentence):
            return sentence[index]
        return None

    def parent_of(self, x):
        """Return the head of token *x* (``'root'`` when its HEAD is 0)."""
        return self.sentences[x[10]][x[6]]

    def grandparent_of(self, occurrence):
        """Return the head of the head of *occurrence*.

        Raises IndexError when the head of *occurrence* is ``'root'``.
        """
        return self.parent_of(self.parent_of(occurrence))

    def right_of(self, x, n: int = 1):
        """Return the token *n* positions after *x*, or None if outside
        the sentence."""
        return self._token_at(x[10], x[0] + n)

    def left_of(self, x, n: int = 1):
        """Return the token *n* positions before *x*, or None if outside
        the sentence."""
        return self._token_at(x[10], x[0] - n)

    def children_of(self, x):
        """Return the tokens of the sentence of *x* whose head is *x*."""
        return [t for t in self.sentences[x[10]][1:] if self.parent_of(t) == x]

    def lineage_of(self, x):
        """Return ``[x, parent, grandparent, ..., 'root']``."""
        output = [x]
        while x != 'root':
            x = self.parent_of(x)
            output.append(x)
        return output

    def get_sentence(self, x):
        """Return the list of word forms of the sentence whose id is *x*."""
        return [j[1] for j in self.sentences[x][1:]]


# USAGE EXAMPLE
# Find every occurrence of 'ⲙⲙⲟ' and of 'ⲛ' used as direct-object marker,
# under the following conditions:
# 1) the clause is in the positive perfective past
# 2) at least one element stands between the verb and the preposition
coptic = UD('testi/UD_Coptic-Scriptorium-master')
found_tokens = []


def is_past(o):
    """Tell whether token *o* has a child 'ⲁ' tagged AUX."""
    return any(
        child[1] == 'ⲁ' and child[3] == 'AUX'
        for child in coptic.children_of(o)
    )


for token in coptic.tokens:
    if token[1] in ('ⲙⲙⲟ', 'ⲛ') and token[7] == 'case':
        try:
            regens = coptic.grandparent_of(token)
            noun_pronoun = coptic.parent_of(token)
            if (noun_pronoun[7] == 'obj' and is_past(regens)
                    and (token[0] - regens[0]) > 1):
                found_tokens.append(token)
        except (IndexError, KeyError, TypeError):
            # The head chain reached the 'root' placeholder (which is not a
            # token) or points outside the sentence: not a candidate.
            continue

for o in found_tokens:
    print(o)