import os

import numpy as np


def load_from_directory(path):
    """Parse every file in `path` into graph records.

    The format is inferred from the parsing rules below: graphs are separated by
    blank lines, a 3-token line carries the graph-level target (second field),
    and every other line is an edge of the form
    `src_id src_type edge_type tgt_id tgt_type` (all integers, 1-based ids).
    """
    node_id_data_list = []
    node_type_data_list = []
    node_id = []
    node_type = []
    graph_type = []
    for file_name in os.listdir(path):
        with open(os.path.join(path, file_name), 'r') as file:
            for line in file:
                # print(line)
                tokens = line.split(' ')
                if not line.strip():
                    # Blank line: the current graph is complete. This assumes every
                    # graph block (including the last one) ends with a blank line.
                    node_id_data_list.append([node_id, graph_type])
                    node_type_data_list.append([node_type, graph_type])
                    node_id = []
                    node_type = []
                    graph_type = []
                elif len(tokens) == 3:
                    # Target line: keep the graph-level label (second field).
                    graph_type.append([int(tokens[1])])
                else:
                    # Edge line, split into [src_id, edge_type, tgt_id] ...
                    node_id.append([int(tokens[0]), int(tokens[2]), int(tokens[3])])
                    # ... and [src_type, edge_type, tgt_type].
                    node_type.append([int(tokens[1]), int(tokens[2]), int(tokens[4])])
    return node_id_data_list, node_type_data_list


def find_max_edge_id(data_list):
    max_edge_id = 0
    for data in data_list:
        edges = data[0]
        for item in edges:
            if item[1] > max_edge_id:
                max_edge_id = item[1]
    return max_edge_id


def find_max_node_id(data_list):
    max_node_id = 0
    for data in data_list:
        edges = data[0]
        for item in edges:
            if item[0] > max_node_id:
                max_node_id = item[0]
            if item[2] > max_node_id:
                max_node_id = item[2]
    return max_node_id


def convert_program_data(data_list, n_annotation_dim, n_nodes):
    """Flatten the raw graphs into [edge_list, annotation, target] triples,
    one per graph-level target."""
    class_data_list = []
    for item in data_list:
        edge_list = item[0]
        target_list = item[1]
        for target in target_list:
            task_output = target[-1]
            annotation = np.zeros([n_nodes, n_annotation_dim])
            for edge in edge_list:
                src_idx = edge[0]
                # Node ids are 1-based, so id n_nodes maps to row n_nodes - 1.
                if src_idx <= n_nodes:
                    annotation[src_idx - 1][0] = 1
            class_data_list.append([edge_list, annotation, task_output])
    return class_data_list


def create_adjacency_matrix(edges, n_nodes, n_edge_types):
    """Build a GGNN adjacency matrix of shape [n_nodes, n_nodes * n_edge_types * 2]:
    the first n_nodes * n_edge_types columns hold incoming edges per edge type,
    the second half holds the reverse (outgoing) direction."""
    a = np.zeros([n_nodes, n_nodes * n_edge_types * 2])
    for edge in edges:
        src_idx = edge[0]
        e_type = edge[1]
        tgt_idx = edge[2]
        # Ids are 1-based, so any id up to n_nodes is a valid row index.
        if tgt_idx <= n_nodes:
            a[tgt_idx - 1][(e_type - 1) * n_nodes + src_idx - 1] = 1
        if src_idx <= n_nodes:
            a[src_idx - 1][(e_type - 1 + n_edge_types) * n_nodes + tgt_idx - 1] = 1
    return a


def create_embedding_matrix(node_id_edges, node_type_edges, n_nodes, n_types):
    """One-hot encode each source node's type: anno[node_id - 1, node_type - 1] = 1."""
    anno = np.zeros([n_nodes, n_types])
    for i in range(len(node_id_edges)):
        node_type = node_type_edges[i][0]
        # print(node_type)
        src_idx = node_id_edges[i][0]
        anno[src_idx - 1][node_type - 1] = 1.0
    return anno


class Dataset:
    """
    Load bAbI-style graph tasks for a GGNN.
    """

    def __init__(self, path, is_train):
        # is_train is currently unused.
        data_id, data_type = load_from_directory(path)
        self.n_edge_types = find_max_edge_id(data_id)
        self.n_node_by_id = find_max_node_id(data_id)
        self.n_node_by_type = find_max_node_id(data_type)
        self.node_by_id = convert_program_data(data_id, 1, self.n_node_by_id)
        self.node_by_type = convert_program_data(data_type, 1, self.n_node_by_type)

    def __getitem__(self, index):
        am = create_adjacency_matrix(self.node_by_id[index][0],
                                     self.n_node_by_id, self.n_edge_types)
        annotation = create_embedding_matrix(self.node_by_id[index][0],
                                             self.node_by_type[index][0],
                                             self.n_node_by_id, self.n_node_by_type)
        # Targets in the data are 1-based; shift to 0-based class labels.
        target = self.node_by_id[index][2] - 1
        return am, annotation, target

    def __len__(self):
        return len(self.node_by_id)


if __name__ == '__main__':
    # data = load_graphs_from_file(
    #     "/Users/liufan/program/PYTHON/sap2nd/GnnForPrivacyScan/data/traindata/train/Directory.txt")
    # a = 5
    bi = Dataset(
        r"I:\Program\Python\sap\GnnForPrivacyScan\data\traindata\train", True)
    for data in bi:
        a = 5  # placeholder body; the loop just exercises the loader
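

# Minimal usage sketch (an assumption, not part of the original file): because
# __getitem__ returns numpy arrays plus a plain int label, the Dataset above can
# be wrapped directly in torch.utils.data.DataLoader for batched GGNN training.
# The function name and batch size below are illustrative only.
def _example_dataloader(path, batch_size=10):
    from torch.utils.data import DataLoader  # assumes PyTorch is installed
    # DataLoader accepts any map-style object exposing __getitem__ and __len__;
    # its default collate function stacks the numpy arrays into batched tensors.
    return DataLoader(Dataset(path, True), batch_size=batch_size, shuffle=True)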