import os
import random

import numpy as np


def load_from_file(file_path, is_binary=0):
    """Parse one graph file into per-graph node-id and node-type records.

    Graphs are separated by blank lines. A 3-token line carries the graph
    label; a 5-token line carries one edge as
    ``src_id src_type edge_type tgt_id tgt_type``.

    :param file_path: path to a whitespace-separated graph file
    :param is_binary: 0: keep the label from the file, 1: force label 1
        (binary positive), 2: force label 2 (binary negative)
    :return: a list of [edges-by-id, labels] and a list of [edges-by-type, labels]
    """
    node_id_data_list = []
    node_type_data_list = []
    node_id = list()
    node_type = list()
    graph_type = list()
    with open(file_path) as file:
        for line in file:
            if len(line.strip()) == 0:
                node_id_data_list.append([node_id, graph_type])
                node_type_data_list.append([node_type, graph_type])
                node_id = list()
                node_type = list()
                graph_type = list()
            elif len(line.split(' ')) == 3:
                if is_binary == 0:
                    graph_type.append([int(line.split(' ')[1])])
                elif is_binary == 1:
                    graph_type.append([1])
                else:
                    graph_type.append([2])
            else:
                data = line.split(' ')
                node_id.append([int(data[0]), int(data[2]), int(data[3])])
                node_type.append([int(data[1]), int(data[2]), int(data[4])])
    # Keep the last graph when the file does not end with a blank line.
    if len(node_id) > 0:
        node_id_data_list.append([node_id, graph_type])
        node_type_data_list.append([node_type, graph_type])
    return node_id_data_list, node_type_data_list


def load_from_directory(path):
    node_id_data_list = []
    node_type_data_list = []
    for file_name in os.listdir(path):
        node_id, node_type = load_from_file(os.path.join(path, file_name))
        node_id_data_list.extend(node_id)
        node_type_data_list.extend(node_type)
    return node_id_data_list, node_type_data_list


def find_max_edge_id(data_list):
    max_edge_id = 0
    for data in data_list:
        edges = data[0]
        for item in edges:
            if item[1] > max_edge_id:
                max_edge_id = item[1]
    return max_edge_id


def find_max_node_id(data_list):
    max_node_id = 0
    for data in data_list:
        edges = data[0]
        for item in edges:
            if item[0] > max_node_id:
                max_node_id = item[0]
            if item[2] > max_node_id:
                max_node_id = item[2]
    return max_node_id


def convert_program_data(data_list, n_annotation_dim, n_nodes):
    class_data_list = []
    for item in data_list:
        edge_list = item[0]
        target_list = item[1]
        for target in target_list:
            task_output = target[-1]
            annotation = np.zeros([n_nodes, n_annotation_dim])
            for edge in edge_list:
                src_idx = edge[0]
                # Ids are 1-based, so id n_nodes maps to row n_nodes - 1.
                if src_idx - 1 < n_nodes:
                    annotation[src_idx - 1][0] = 1
            class_data_list.append([edge_list, annotation, task_output])
    return class_data_list


def create_adjacency_matrix(edges, n_nodes, n_edge_types):
    # Row i holds node i + 1: incoming edges per type in the first
    # n_nodes * n_edge_types columns, outgoing edges in the second half.
    a = np.zeros([n_nodes, n_nodes * n_edge_types * 2])
    for edge in edges:
        src_idx = edge[0]
        e_type = edge[1]
        tgt_idx = edge[2]
        # Ids are 1-based, so id n_nodes maps to row n_nodes - 1.
        if tgt_idx - 1 < n_nodes:
            a[tgt_idx - 1][(e_type - 1) * n_nodes + src_idx - 1] = 1
        if src_idx - 1 < n_nodes:
            a[src_idx - 1][(e_type - 1 + n_edge_types) * n_nodes + tgt_idx - 1] = 1
    return a


def create_embedding_matrix(node_id_edges, node_type_edges, n_nodes, n_types):
    # One-hot encode each edge's source node by its node type.
    anno = np.zeros([n_nodes, n_types])
    for i in range(len(node_id_edges)):
        node_type = node_type_edges[i][0]
        src_idx = node_id_edges[i][0]
        anno[src_idx - 1][node_type - 1] = 1.0
    return anno
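# A minimal sanity-check sketch (not part of the original pipeline; the helper
# name _demo_adjacency and the toy edge are illustrative assumptions) showing
# the layout create_adjacency_matrix produces: the first n_nodes * n_edge_types
# columns mark incoming edges per type, the remaining columns mark outgoing ones.
def _demo_adjacency():
    # Two nodes, one edge type, a single edge 1 -> 2 (ids are 1-based).
    edges = [[1, 1, 2]]
    a = create_adjacency_matrix(edges, n_nodes=2, n_edge_types=1)
    # The target node's row marks the source in the "incoming" block ...
    assert a[1][0] == 1
    # ... and the source node's row marks the target in the "outgoing" block.
    assert a[0][3] == 1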
class Dataset:
    """Load program-graph tasks (bAbI-style file format) for a GGNN."""

    def __init__(self, path, is_train):
        data_id = list()
        data_type = list()
        train_data_id, train_data_type = load_from_directory(path + "/train")
        test_data_id, test_data_type = load_from_directory(path + "/test")
        data_id.extend(train_data_id)
        data_id.extend(test_data_id)
        data_type.extend(train_data_type)
        data_type.extend(test_data_type)
        # Derive sizes from train and test together so both splits produce
        # matrices of the same shape.
        self.n_edge_types = find_max_edge_id(data_id)
        self.n_node_by_id = find_max_node_id(data_id)
        self.n_node_by_type = find_max_node_id(data_type)
        if is_train:
            self.node_by_id = convert_program_data(train_data_id, 1, self.n_node_by_id)
            self.node_by_type = convert_program_data(train_data_type, 1, self.n_node_by_type)
        else:
            self.node_by_id = convert_program_data(test_data_id, 1, self.n_node_by_id)
            self.node_by_type = convert_program_data(test_data_type, 1, self.n_node_by_type)

    def __getitem__(self, index):
        am = create_adjacency_matrix(self.node_by_id[index][0],
                                     self.n_node_by_id, self.n_edge_types)
        annotation = create_embedding_matrix(self.node_by_id[index][0],
                                             self.node_by_type[index][0],
                                             self.n_node_by_id, self.n_node_by_type)
        target = self.node_by_id[index][2] - 1
        return am, annotation, target

    def __len__(self):
        return len(self.node_by_id)


def load_from_directory_binary(path, class_type):
    node_id_data_list = []
    node_type_data_list = []
    # Positive examples: every graph of the requested class, labelled 1.
    node_id_binary_true, node_type_binary_true = load_from_file(
        os.path.join(path, class_type + ".txt"), 1)
    node_id_data_list.extend(node_id_binary_true)
    node_type_data_list.extend(node_type_binary_true)
    id_len = len(node_id_data_list)
    # Negative examples: graphs of every other class, labelled 2.
    node_id_data_list_false = []
    node_type_data_list_false = []
    for file_name in os.listdir(path):
        if file_name != class_type + ".txt":
            node_id_binary_false, node_type_binary_false = load_from_file(
                os.path.join(path, file_name), 2)
            node_id_data_list_false.extend(node_id_binary_false)
            node_type_data_list_false.extend(node_type_binary_false)
    # Shuffle both lists in the same order so a graph's id and type records
    # stay aligned, then truncate to id_len to keep the classes balanced.
    order = list(range(len(node_id_data_list_false)))
    random.shuffle(order)
    node_id_data_list.extend(node_id_data_list_false[i] for i in order[:id_len])
    node_type_data_list.extend(node_type_data_list_false[i] for i in order[:id_len])
    return node_id_data_list, node_type_data_list


class BinaryDataset:
    """One-vs-rest variant of Dataset: graphs of class_type against an equally
    sized sample of all other classes."""

    def __init__(self, path, class_type, is_train):
        data_id = list()
        data_type = list()
        train_data_id, train_data_type = load_from_directory_binary(path + "/train", class_type)
        test_data_id, test_data_type = load_from_directory_binary(path + "/test", class_type)
        data_id.extend(train_data_id)
        data_id.extend(test_data_id)
        data_type.extend(train_data_type)
        data_type.extend(test_data_type)
        self.n_edge_types = find_max_edge_id(data_id)
        self.n_node_by_id = find_max_node_id(data_id)
        self.n_node_by_type = find_max_node_id(data_type)
        if is_train:
            self.node_by_id = convert_program_data(train_data_id, 1, self.n_node_by_id)
            self.node_by_type = convert_program_data(train_data_type, 1, self.n_node_by_type)
        else:
            self.node_by_id = convert_program_data(test_data_id, 1, self.n_node_by_id)
            self.node_by_type = convert_program_data(test_data_type, 1, self.n_node_by_type)

    def __getitem__(self, index):
        am = create_adjacency_matrix(self.node_by_id[index][0],
                                     self.n_node_by_id, self.n_edge_types)
        annotation = create_embedding_matrix(self.node_by_id[index][0],
                                             self.node_by_type[index][0],
                                             self.n_node_by_id, self.n_node_by_type)
        target = self.node_by_id[index][2] - 1
        return am, annotation, target

    def __len__(self):
        return len(self.node_by_id)


if __name__ == '__main__':
    binary_dataset = BinaryDataset(r"I:\Program\Python\sap\GnnForPrivacyScan\data\traindatabinary",
                                   "Archive", True)
    for am, annotation, target in binary_dataset:
        pass  # iterate once over the dataset as a smoke test
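# A minimal usage sketch (assuming PyTorch as the training framework; this
# module itself only needs numpy, and _demo_dataloader is an illustrative
# helper, not part of the original pipeline). Both dataset classes implement
# the __getitem__/__len__ protocol that torch.utils.data.DataLoader expects,
# so they can be batched directly.
def _demo_dataloader():
    from torch.utils.data import DataLoader

    train_set = Dataset(r"I:\Program\Python\sap\GnnForPrivacyScan\data\traindata", True)
    loader = DataLoader(train_set, batch_size=8, shuffle=True)
    for am, annotation, target in loader:
        # am: [batch_size, n_node_by_id, n_node_by_id * n_edge_types * 2]
        break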