Questa è una vecchia versione del documento!
Copiate nella cartella “testi” la cartella “UD_Coptic-Scriptorium-master” scaricabile in formato compresso da questo link (si veda il file LICENSE.txt per riferimenti e condizioni di utilizzo).
8 ⲛⲧⲟⲕ ⲛⲧⲟⲕ PRON PPERI Definite=Def|Gender=Masc|Number=Sing|Person=2|PronType=Prs 3 acl _ Orig=ⲛ̄ⲧⲟⲕ
# -*- coding: utf-8 -*-
#Copyright 2021 P. Milizia CC-BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
import re
import glob


# PYTHON CLASS FOR READING A UD TREEBANK
class UD:
    """In-memory reader for a Universal Dependencies treebank in CoNLL-U text.

    Every token is stored as a list of 11 fields.  Fields 0-9 are the ten
    tab-separated columns of the token line, in file order; field 10 is the
    sentence id that ``add_sent_id`` appends to the line.  Fields 0 and 6 are
    converted to ``int`` when possible, and fields 5 and 9 are split on ``|``
    into lists of strings.

    Attributes:
        data: the input text with the sentence id appended to each token line.
        tokens: every token line of the treebank, in file order.
        sentences: dict mapping a sentence id to ``['root', tok1, tok2, ...]``,
            so that a token sits at the list index equal to its integer id
            and index 0 holds the ``'root'`` placeholder.
    """

    # Captures whatever follows "sent_id = " on a line.
    _SENT_ID_RE = re.compile(r'(?<=sent_id\s=\s).*')

    def __init__(self, data):
        """Parse *data*, the text of one or more concatenated CoNLL-U files."""
        self.data = self.add_sent_id(data)
        self.tokens = self.create_tokens(self.data)
        self.sentences = self.create_sentences(self.tokens)

    # adds the sent_id field
    def add_sent_id(self, text):
        """Return *text* with the current sentence id appended to token lines.

        The id is taken from the most recent line containing ``sent_id = ``.
        It is appended, after a tab, to every line that is neither empty nor
        a ``#`` comment.  Every output line is terminated by a newline.
        """
        current_sid = ''
        out_lines = []
        for line in text.split('\n'):
            found = self._SENT_ID_RE.search(line)
            if found is not None:
                current_sid = found.group()
            if line != '' and line[0] != '#':
                line = line + '\t' + current_sid
            out_lines.append(line + '\n')
        # join instead of repeated string +=, which is quadratic on big corpora
        return ''.join(out_lines)

    # creates the list of tokens
    def create_tokens(self, data):
        """Return the token lines of *data* as a list of 11-field lists.

        Comment lines, empty lines and lines with fewer than 11 tab-separated
        fields are ignored.  Fields 5 and 9 are split on ``|``.  Fields 0 and
        6 become ``int`` when they are plain integers and are otherwise left
        as strings (for instance a range id such as ``1-2``).
        """
        tokens = []
        # Parsing line by line (rather than with a regex anchored on a
        # preceding newline) also picks up a token on the very first line.
        for line in data.split('\n'):
            if not line or line[0] == '#':
                continue
            # maxsplit=10: anything beyond the 11th field stays in field 10
            fields = line.split('\t', 10)
            if len(fields) != 11:
                continue
            for index in (5, 9):
                fields[index] = fields[index].split('|')
            for index in (0, 6):
                try:
                    fields[index] = int(fields[index])
                except ValueError:
                    pass
            tokens.append(fields)
        return tokens

    # creates the dictionary of sentences
    def create_sentences(self, tokens):
        """Group *tokens* by sentence id into ``['root', tok1, ...]`` lists.

        A new sentence list is started every time the sentence id changes
        from one token to the next.  Only tokens whose id is an ``int`` are
        stored, each one as a shallow copy.
        """
        sentences = {}
        # None (not '') as the start value, so that tokens carrying an empty
        # sentence id still get their sentence list created.
        current_sid = None
        for token in tokens:
            if token[10] != current_sid:
                current_sid = token[10]
                sentences[current_sid] = ['root']
            if isinstance(token[0], int):
                sentences[current_sid].append(list(token))
        return sentences

    def parent_of(self, occurrence):
        """Return the head of *occurrence*, or ``'root'`` when its head is 0."""
        x = self.sentences[occurrence[10]][occurrence[0]]
        return self.sentences[x[10]][x[6]]

    def grandparent_of(self, occurrence):
        """Return the head of the head of *occurrence*.

        Raises IndexError when the head of *occurrence* is ``'root'``.
        """
        return self.parent_of(self.parent_of(occurrence))

    def subsequent_of(self, occurrence):
        """Return the token following *occurrence* in its sentence.

        Returns None at the end of the sentence or when the id of
        *occurrence* is not an integer.
        """
        try:
            return self.sentences[occurrence[10]][occurrence[0] + 1]
        except (IndexError, TypeError):
            return None

    def precedent_of(self, occurrence):
        """Return the token preceding *occurrence* in its sentence.

        Returns None for the first token of the sentence or when the id of
        *occurrence* is not an integer.
        """
        try:
            position = occurrence[0] - 1
            # Index 0 holds the 'root' placeholder and negative indices would
            # wrap around to the end of the sentence: neither is a token
            # that precedes this one.
            if position < 1:
                return None
            return self.sentences[occurrence[10]][position]
        except (IndexError, TypeError):
            return None

    def children_of(self, occurrence):
        """Return the tokens of the sentence whose head is *occurrence*."""
        x = self.sentences[occurrence[10]][occurrence[0]]
        return [t for t in self.sentences[occurrence[10]][1:]
                if self.parent_of(t) == x]

    def ancestors_of(self, occurrence):
        """Return *occurrence* followed by its chain of heads up to ``'root'``."""
        x = self.sentences[occurrence[10]][occurrence[0]]
        output = [x]
        while x != 'root':
            x = self.parent_of(x)
            output.append(x)
        return output

    def get_sentence(self, x):
        """Return the list of word forms (field 1) of the sentence with id *x*."""
        return [j[1] for j in self.sentences[x][1:]]


# PREPARING THE DATA FOR THE CLASS

# reading the files
directory_name = 'testi/UD_Coptic-Scriptorium-master'
path = directory_name + '/*conllu'
files = glob.glob(path)

file_contents = []
for filepath in files:
    # The encoding is given explicitly: relying on the platform default
    # garbles or rejects the Coptic text where that default is not UTF-8.
    with open(filepath, 'r', encoding='utf-8') as file:
        file_contents.append(file.read())
data = ''.join(file_contents)

# sorts the documents and creates a variable text_list with the list of texts
data = data.split('# newdoc ')[1:]
data.sort()
data = '# newdoc ' + '# newdoc '.join(data)
text_list = re.findall(r'#\snewdoc\s.*', data)


# USAGE EXAMPLE
# finds all the occurrences of 'ⲙⲙⲟ' and 'ⲛ' as direct object markers
# under the following conditions:
# 1) the clause is in the positive perfective past
# 2) at least one element stands between the verb and the preposition

coptic = UD(data)
found_tokens = []


def is_past(o):
    """Return True when *o* has a child with form 'ⲁ' tagged AUX."""
    return any(child[1] == 'ⲁ' and child[3] == 'AUX'
               for child in coptic.children_of(o))


for token in coptic.tokens:
    if token[1] not in ('ⲙⲙⲟ', 'ⲛ') or token[7] != 'case':
        continue
    try:
        regens = coptic.grandparent_of(token)
        noun_pronoun = coptic.parent_of(token)
        if (noun_pronoun[7] == 'obj' and is_past(regens)
                and (int(token[0]) - int(regens[0])) > 1):
            found_tokens.append(token)
    except (IndexError, KeyError, TypeError, ValueError):
        # Best effort: skip tokens whose tree cannot be walked (non-integer
        # id, head that is 'root', head pointing outside the sentence).
        continue

for o in found_tokens:
    print(o)