main.py

def pseudonymize(colname):
    # Assumes module-level globals `key` (pseudonym cache) and `salt`,
    # plus `import hashlib`.
    if colname not in key:
        sha3 = hashlib.sha3_512()
        data = salt + colname
        sha3.update(data.encode('utf-8'))
        hexdigest = sha3.hexdigest()
        key[colname] = hexdigest
        return hexdigest
    else:
        return key[colname]
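# Minimal wiring sketch for the function above (an assumption, not part of the
# original file): define the globals it expects and call it twice to show the
# cache behaviour.
import hashlib

key = {}               # colname -> pseudonym cache
salt = 'example-salt'  # hypothetical secret salt

# pseudonymize('patient_id')  # computes and caches a 128-hex-char SHA3-512 digest
# pseudonymize('patient_id')  # second call returns the cached value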
import logging
from typing import Tuple, Union, List
import microdata_validator
from job_executor.exception import BuilderStepError
from job_executor.adapter import pseudonym_service
from job_executor.model import Metadata

logger = logging.getLogger()


def _get_unit_types(
    metadata: Metadata
) -> Tuple[Union[str, None], Union[str, None]]:
    return (
        metadata.get_identifier_key_type_name(),
        metadata.get_measure_key_type_name()
    )
def _pseudonymize_identifier_only(
    input_csv_path: str,
    unit_id_type: str,
    job_id: str
) -> str:
    unique_identifiers = set()
    with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            unit_id = line.strip().split(';')[1]
            unique_identifiers.add(unit_id)
    identifier_to_pseudonym = pseudonym_service.pseudonymize(
        list(unique_identifiers), unit_id_type, job_id
    )
    output_csv_path = input_csv_path.replace('.csv', '_pseudonymized.csv')
    target_file = open(output_csv_path, 'w', newline='', encoding='utf-8')
    with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            row = line.strip().split(';')
            line_number: str = row[0]  # all fields arrive as strings
            unit_id: str = row[1]
            value: str = row[2]
            start_date: str = row[3]
            stop_date: str = row[4]
            target_file.write(
                ';'.join([
                    line_number,
                    str(identifier_to_pseudonym[unit_id]),
                    value,
                    start_date, stop_date
                ]) + '\n'
            )
    target_file.close()
    return output_csv_path
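# Row-format illustration (an assumption derived from the column unpacking above,
# not from external docs): input rows look like
#   <line_number>;<unit_id>;<value>;<start_date>;<stop_date>
# e.g. '1;12345678901;A;2020-01-01;2020-12-31', and only the unit_id (and/or the
# value) column is swapped for its pseudonym in the output file. The two sibling
# helpers below follow the same layout.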
def _pseudonymize_measure_only(
    input_csv_path: str,
    unit_id_type: str,
    job_id: str
) -> str:
    unique_measure_values = set()
    with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            value = line.strip().split(';')[2]
            unique_measure_values.add(value)
    value_to_pseudonym = pseudonym_service.pseudonymize(
        list(unique_measure_values), unit_id_type, job_id
    )
    output_csv_path = input_csv_path.replace('.csv', '_pseudonymized.csv')
    target_file = open(output_csv_path, 'w', newline='', encoding='utf-8')
    with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            row = line.strip().split(';')
            line_number: str = row[0]
            unit_id: str = row[1]
            value: str = row[2]
            start_date: str = row[3]
            stop_date: str = row[4]
            target_file.write(
                ';'.join([
                    line_number,
                    unit_id,
                    str(value_to_pseudonym[value]),
                    start_date, stop_date
                ]) + '\n'
            )
    target_file.close()
    return output_csv_path
def _pseudonymize_identifier_and_measure(
    input_csv_path: str,
    identifier_unit_id_type: str,
    measure_unit_id_type: str,
    job_id: str
) -> str:
    unique_idents = set()
    unique_measure_values = set()
    with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            row = line.strip().split(';')
            unit_id = row[1]
            value = row[2]
            unique_idents.add(unit_id)
            unique_measure_values.add(value)
    identifier_to_pseudonym = pseudonym_service.pseudonymize(
        list(unique_idents), identifier_unit_id_type, job_id
    )
    value_to_pseudonym = pseudonym_service.pseudonymize(
        list(unique_measure_values), measure_unit_id_type, job_id
    )
    output_csv_path = input_csv_path.replace('.csv', '_pseudonymized.csv')
    target_file = open(output_csv_path, 'w', newline='', encoding='utf-8')
    with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            row = line.strip().split(';')
            line_number: str = row[0]
            unit_id: str = row[1]
            value: str = row[2]
            start_date: str = row[3]
            stop_date: str = row[4]
            target_file.write(
                ';'.join([
                    line_number,
                    str(identifier_to_pseudonym[unit_id]),
                    str(value_to_pseudonym[value]),
                    start_date, stop_date
                ]) + '\n'
            )
    target_file.close()
    return output_csv_path
def _pseudonymize_csv(
    input_csv_path: str,
    identifier_unit_id_type: Union[str, None],
    measure_unit_id_type: Union[str, None],
    job_id: str
) -> str:
    if identifier_unit_id_type and not measure_unit_id_type:
        logger.info('Pseudonymizing identifier')
        return _pseudonymize_identifier_only(
            input_csv_path, identifier_unit_id_type, job_id
        )
    elif measure_unit_id_type and not identifier_unit_id_type:
        logger.info('Pseudonymizing measure')
        return _pseudonymize_measure_only(
            input_csv_path, measure_unit_id_type, job_id
        )
    elif identifier_unit_id_type and measure_unit_id_type:
        logger.info('Pseudonymizing identifier and measure')
        return _pseudonymize_identifier_and_measure(
            input_csv_path,
            identifier_unit_id_type,
            measure_unit_id_type,
            job_id
        )
    else:
        logger.info('No pseudonymization')
        return input_csv_path
def run(input_csv_path: str, metadata: Metadata, job_id: str) -> str:
    """
    Pseudonymizes the identifier and/or measure columns of the dataset,
    depending on the unit types found in the metadata. Requests pseudonyms
    from an external service and replaces all values in the affected columns.
    """
    try:
        logger.info(f'Pseudonymizing data {input_csv_path}')
        identifier_unit_type, measure_unit_type = (
            _get_unit_types(metadata)
        )
        identifier_unit_id_type = (
            None if identifier_unit_type is None
            else microdata_validator.get_unit_id_type_for_unit_type(
                identifier_unit_type
            )
        )
        measure_unit_id_type = (
            None if measure_unit_type is None
            else microdata_validator.get_unit_id_type_for_unit_type(
                measure_unit_type
            )
        )
        output_file = _pseudonymize_csv(
            input_csv_path,
            identifier_unit_id_type,
            measure_unit_id_type,
            job_id
        )
        logger.info(f'Pseudonymization step done {output_file}')
        return output_file
    except Exception as e:
        logger.error(f'Error during pseudonymization: {str(e)}')
        raise BuilderStepError('Failed to pseudonymize dataset') from e
def pseudonymize_1(self, df, schema):  # schema: list of (col_name, dtype, op)
    """Performs pseudonymization of the given dataframe based on the provided schema.
    For example, if the given df is for an entity called person,
    2 dataframes will be returned: one called person that has hashed ids and masked
    fields, and one called person_lookup that contains the original person_id,
    person_id_pseudo, and the non-masked values for columns marked to be masked."""
    # Requires: from pyspark.sql import functions as F
    df_pseudo = df_lookup = df
    for col_name, dtype, op in schema:
        if op == "hash-no-lookup" or op == "hnl":
            # The lookup can be performed against a different table, so no lookup is needed.
            df_pseudo = df_pseudo.withColumn(
                col_name, F.sha2(F.concat(F.col(col_name), F.lit(self.salt)), 256)
            ).withColumnRenamed(col_name, col_name + "_pseudonym")
            df_lookup = df_lookup.drop(col_name)
        elif op == "hash" or op == 'h':
            df_pseudo = df_pseudo.withColumn(
                col_name, F.sha2(F.concat(F.col(col_name), F.lit(self.salt)), 256)
            ).withColumnRenamed(col_name, col_name + "_pseudonym")
            df_lookup = df_lookup.withColumn(
                col_name + "_pseudonym",
                F.sha2(F.concat(F.col(col_name), F.lit(self.salt)), 256)
            )
        elif op == "mask" or op == 'm':
            df_pseudo = df_pseudo.withColumn(col_name, F.lit('*'))
        elif op == "partition-by":
            # Make no changes for this column so that it stays in both dataframes
            # and can be used for partitioning.
            pass
        elif op == "no-op" or op == 'x':
            df_lookup = df_lookup.drop(col_name)
    df_pseudo = self.fix_column_names(df_pseudo)
    df_lookup = self.fix_column_names(df_lookup)
    return (df_pseudo, df_lookup)
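# Usage sketch for pseudonymize_1 (assumption: schema rows are
# (column_name, dtype, op) triples, inferred from the loop above):
# schema = [
#     ("person_id", "string", "hash"),        # hashed, kept in lookup
#     ("ssn", "string", "mask"),              # replaced with '*'
#     ("load_date", "date", "partition-by"),  # kept in both frames
#     ("comment", "string", "no-op"),         # dropped from lookup
# ]
# person, person_lookup = obj.pseudonymize_1(df, schema)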
def pseudonymize_2(value, salt=SALT_KEY):
    """Pseudonymize value with salt, using HMAC-SHA256
    Parameters
    ----------
    value: value to be pseudonymized
    salt: secret salt for additional protection
    Returns
    -------
    pseudonymized value using HMAC-SHA256
    """
    # NOTE: Bypass empty or None values here, since hashing them
    # would introduce a specific, recognizable hash value.
    if value is None or value is np.nan or value == '':
        return None
    return hmac.new(
        key=salt.encode('utf-8'),        # the key
        msg=str(value).encode('utf-8'),  # the data to pseudonymize
        digestmod=hashlib.sha256         # the hash function
    ).hexdigest()                        # hex-encoded output
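# Minimal usage sketch (an assumption, not part of the original source):
# `hmac`, `hashlib`, `np` and `SALT_KEY` must be in scope, e.g.:
import hmac
import hashlib
import numpy as np

SALT_KEY = 'example-secret'  # hypothetical value

# pseudonymize_2('alice') == pseudonymize_2('alice')  -> deterministic
# pseudonymize_2(None) is None                        -> empty values pass through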
def pseudonymize_row(row):
    """
    Replace some identifying information with other values:
    - Fake name
    - Birthdate is replaced with the age
    """
    # Assumes globals: `faker` (a Faker instance) and `today` (a date).
    anonymized_row = row.copy()
    # Using Faker (https://faker.readthedocs.io/en/master/), we generate fake names
    if anonymized_row['Gender'] == 'Female':
        anonymized_row['Fullname'] = faker.name_female()
    else:
        anonymized_row['Fullname'] = faker.name_male()
    del anonymized_row['Birthdate']
    birthdate = datetime.strptime(row['Birthdate'], '%Y-%m-%d')
    age = today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))
    anonymized_row['Age'] = age
    return anonymized_row
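# Minimal harness for pseudonymize_row (an assumption, not from the original
# source), defining the globals the function expects:
from datetime import datetime, date
from faker import Faker

faker = Faker()
today = date.today()

row = {'Fullname': 'Jane Doe', 'Gender': 'Female', 'Birthdate': '1990-05-17'}
# pseudonymize_row(row) -> {'Fullname': <fake female name>, 'Gender': 'Female',
#                           'Age': <age computed against today>}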
def anonymize_one(self, column, delete: bool, pattern: AnonymizationPattern = None):
    if column is None:
        return Logger.log_none_type_error('column')
    Logger.log_info_table_manipulation_started(self.filename, f'Anonymize One ({column})')
    # Delete column in every dataset if found
    error_count = 0
    if delete:
        for ds in self.datasets:
            out = ds.delete_column(column)
            if out < 1:
                error_count += 1
        self.remove_columnnames([column])
    else:
        # If column is not deleted: generate a value for column, random or by pattern
        if pattern is None:
            for ds in self.datasets:
                out = ds.set_columnvalue_random(column)
                if out < 1:
                    error_count += 1
        else:
            for ds in self.datasets:
                out = ds.set_columnvalue_by_pattern(column, pattern)
                if out < 1:
                    error_count += 1
    Logger.log_info_table_manipulation_finished(error_count)
    return error_count
def pseudonymize_3(field):
    return sha256(field.encode() + get_seed(seed).encode()).hexdigest()[:20]
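# Sketch of the helpers pseudonymize_3 relies on (hypothetical; they are not part
# of the snippet):
from hashlib import sha256

seed = 'example-seed'  # hypothetical secret

def get_seed(s: str) -> str:
    # Stand-in for however the real code resolves its seed
    return s

# pseudonymize_3('alice@example.com') -> first 20 hex chars of the salted digest;
# truncation shortens the token at the cost of a higher collision probability.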
def pseudonymize_columns(dataframe, cols,
                         ps_key='test',
                         api_key=SHARED_KEY):
    actions = [
        {"name": "pseudonymize-{}".format(c),
         "transform-value": {
             "key": c,
             "pseudonymize": {
                 "method": "merengue",
                 "key": ps_key,
             }
         }} for c in cols]
    items = dataframe.fillna('').T.to_dict()
    item_list = list(items.values())
    data = requests.post(
        'https://api.kiprotect.com/v1/transform',
        data=json.dumps(
            {"actions": actions, "items": item_list},
            allow_nan=False),
        headers={
            'Authorization': 'Bearer {}'.format(api_key)})
    return pd.DataFrame(data.json()['items'])
def _parse_url_parts(self, tld_extractor: TLDExtract, url_str: str) -> dict:
    url = tld_extractor(url_str)
    parts = {}
    parts["scheme"] = self._find_first(r"^([a-z0-9]+)\:\/\/", url_str)
    parts["auth"] = self._find_first(r"(?:.*\/\/|^)(.*:.*)@.*", url_str)
    parts["domain"] = url.domain
    parts["subdomain"] = url.subdomain
    parts["suffix"] = url.suffix
    url_list = ".".join(list(url))
    parts["path"] = self._find_first(
        rf"(?:^[a-z0-9]+\:\/\/)?{url_list}(?:\:\d+)?([^#^\?]*).*", url_str
    )
    parts["query"] = self._find_first(r".*(\?\w+=[a-zA-Z0-9](?:&\w+=[a-zA-Z0-9]+)*).*", url_str)
    parts["fragment"] = self._find_first(r".*#(.*)", url_str)
    return parts
def _pseudonymize_value(self, value: str, pseudonyms: list[dict]) -> str:
    hash_string = self._hasher.hash_str(value, salt=self._config.hash_salt)
    if self._cache.requires_storing(hash_string):
        encrypted_origin = self._encrypter.encrypt(value)
        pseudonyms.append({"pseudonym": hash_string, "origin": encrypted_origin})
    return self._wrap_hash(hash_string)
def base64_method(data_path, columns):
    data = pd.read_csv(data_path)
    data = data.dropna()  # was: bare `data.dropna()`, whose result was discarded
    data.reset_index(drop=True, inplace=True)
    existing_columns = list(data)
    for column in columns:
        if column in existing_columns:
            data[column] = data[column].apply(str)
            data[column] = data[column].apply(lambda x: base64.b64encode(bytes(x, 'utf-8')))
    return data
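# Caveat worth making explicit (editorial note, not in the original): base64 is a
# reversible encoding, not a one-way hash, so this only obscures values:
import base64

token = base64.b64encode(bytes('alice', 'utf-8'))  # b'YWxpY2U='
assert base64.b64decode(token) == b'alice'         # original trivially recovered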
def pseudonymize_4(self, s):
    # Integer division: on Python 3, len(s) / self.__byte is a float,
    # which breaks the '%d' format below.
    sl = len(s) // self.__byte
    return struct.unpack('<%dh' % sl, s)
def _replace_name(item, value, field, dicom):
    sex = dicom.get("PatientSex")
    sex = {"F": "Female", "M": "Male", "O": "Other", "": "Unk"}[sex]
    age = Deider._round_to_nearest(parse_AS_as_int(dicom.get("PatientAge")), 5)
    return f"{sex} {age:03d}Y {dicom.get('Modality')}"
def apply(config, val):
    """Pseudonymize using format preserving encryption.
    Example config:
    {
        'func': 'fpe',
        'key': 'some-secret-key',
        'alphabet': string.ascii_letters
    }
    """
    validate_func_params(config, MANDATORY_CONFIG_PARAMS)
    try:
        alphabet = config.get('alphabet', string.printable)
        e = pyffx.String(config['key'].encode("utf-8"), alphabet, length=len(val))
        return e.encrypt(val)
    except ValueError:
        raise PseudoFuncError("Could not pseudonymize '{0}'. Check alphabet compatibility ({1})".format(val, alphabet))
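# Usage sketch (an assumption mirroring the docstring config; pyffx encrypts to a
# string of the same length over the same alphabet, so formats are preserved):
import string
import pyffx

config = {'func': 'fpe', 'key': 'some-secret-key', 'alphabet': string.digits}
# apply(config, '20240131') -> eight digits, reversible via e.decrypt(...)
# with the same key and alphabet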
def pseudonymize_6(text: str, tagger: SequenceTagger) -> Tuple[str, str]:
    """
    Perform the pseudonymization action and return both the tagged version
    (see function "tag_entities") and the pseudonymized version
    Args:
        text (str): the input text to pseudonymize
        tagger (SequenceTagger): the flair model for NER
    Returns:
        Tuple[str, str]: the original text with tags, and the pseudonymized text
    """
    with sw.timer("root"):
        text_sentences = [Sentence(t.strip()) for t in text.split("\n") if t.strip()]
        with sw.timer("model_annotation"):
            # in-place function
            tagger.predict(
                sentences=text_sentences,
                mini_batch_size=32,
                embedding_storage_mode="none",
                verbose=True,
            )
        return tag_entities(sentences=text_sentences)
def get_replacement_stock() -> List[str]:
    """
    A list of faked names to replace the information you want to hide
    """
    stock = [f"{letter}..." for letter in ascii_uppercase] + [
        f"{a}{b}..." for a, b in list(itertools.combinations(ascii_uppercase, 2))
    ]
    random.shuffle(stock)
    return stock
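# Illustration (editorial note, not in the original): the stock holds 26
# one-letter plus C(26, 2) = 325 two-letter abbreviations, shuffled, e.g.
# get_replacement_stock()[:3] -> ['Q...', 'AF...', 'B...']  (order is random)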
def apply_tagging_sentence(
    starts: List[int],
    ends: List[int],
    tags: List[str],
    entities: List[str],
    plain_text: str,
    replacement_dict: Dict[str, str],
) -> Tuple[str, str]:
    """
    Args:
        starts, ends, tags, entity texts of the entities found in the sentence
        + the text of the sentence + the prepared replacement dictionary for pseudo
    Returns:
        str, str: a text where the entities have an XML tag, and a text where
        entities have been pseudonymized
    """
    assert (
        len(starts) == len(ends) == len(tags) == len(entities)
    ), "Input lists must be of the same length"
    shift_tags_start, shift_tags_end = 0, 0  # shift due to the added tags
    shift_pseudo_start, shift_pseudo_end = 0, 0
    tagged_sentence, pseudo_sentence = plain_text, plain_text
    n_entities = len(starts)
    for i in range(n_entities):
        start, end, entity, tag = starts[i], ends[i], entities[i], tags[i]
        replacement = replacement_dict[entity]
        pseudo_sentence = (
            pseudo_sentence[: start + shift_pseudo_start]
            + replacement
            + pseudo_sentence[end + shift_pseudo_end:]
        )
        shift_pseudo_start += len(replacement) - (end - start)
        shift_pseudo_end += len(replacement) - (end - start)
        tagged_sentence = (
            tagged_sentence[: start + shift_tags_start]
            + "</a>"
            + f"<{tag}>"
            + plain_text[start:end]
            + f"</{tag}>"
            + "<a>"
            + tagged_sentence[end + shift_tags_end:]
        )
        # 5 characters for a tag like <PER> (or <LOC>, <ORG>) + 6 for </PER>
        # + 3 for <a> and 4 for </a>
        shift_tags_start += 5 + 6 + 3 + 4
        shift_tags_end += 5 + 6 + 3 + 4
    tagged_sentence = "<a>" + tagged_sentence + "</a>"
    tagged_sentence = tagged_sentence.replace("<a></a>", "")
    return (
        f"<sentence>{tagged_sentence}</sentence>",
        pseudo_sentence,
    )
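# Worked example (constructed for illustration; not from the original source):
# apply_tagging_sentence(
#     starts=[0], ends=[4], tags=["PER"], entities=["John"],
#     plain_text="John went home",
#     replacement_dict={"John": "M..."},
# )
# returns:
#   ('<sentence><PER>John</PER><a> went home</a></sentence>', 'M... went home')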
def english_pseudo(text):
    anon = AnonymizerChain(Anonymization('en_US'))
    anon.add_anonymizers(EmailAnonymizer, NamedEntitiesAnonymizer('en_core_web_lg'))
    clean_text, patch = anon.pseudonymize(text)
    return clean_text, patch  # assumed: the original snippet ended without a return
def pseudonymize_user_name(self, user_name: UserName) -> PseudoUserName:
    hasher = hashlib.sha256()
    hasher.update(user_name.encode('utf-8'))
    # NUL byte as a separator between name and salt
    hasher.update(b'\0')
    hasher.update(self.salt)
    pseudonymized = base64.b64encode(hasher.digest()).decode('utf-8')
    return PseudoUserName(pseudonymized)
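# Standalone sketch of the same construction (an assumption mirroring the method
# above, for use outside its class):
import base64
import hashlib

def _pseudo_user_name(user_name: str, salt: bytes) -> str:
    hasher = hashlib.sha256()
    hasher.update(user_name.encode('utf-8'))
    hasher.update(b'\0')  # separator between name and salt
    hasher.update(salt)
    return base64.b64encode(hasher.digest()).decode('utf-8')

# _pseudo_user_name('alice', b'example-salt') is stable for a fixed salt.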
def parse_lines(text):
    lines = []
    for m in LINE_RE.finditer(text):
        ln = {"TIMESTAMP": parse_date(m.group(1).strip("\n").strip()),
              "SPEAKER": m.group(2).strip(),
              "MESSAGE": m.group(3).strip()}
        lines.append(ln)
    return lines  # was a bare `return`, which discarded the parsed lines
def pseudonymize_7(graph: ProvDocument) -> ProvDocument:
    log.info(f"pseudonymize agents in {graph=}")
    # Get all records except for agents and relations
    records = list(graph.get_records((ProvActivity, ProvEntity)))
    pseudonyms = dict()
    for agent in graph.get_records(ProvAgent):
        name = get_attribute(agent, USERNAME)
        mail = get_attribute(agent, USEREMAIL)
        if name is None:
            raise ValueError("ProvAgent representing a user has to have a name!")
        # Hash name & mail if present
        namehash = hashlib.sha256(bytes(name, "utf-8")).hexdigest()
        mailhash = hashlib.sha256(bytes(mail, "utf-8")).hexdigest() if mail else None
        # Create a new id as a pseudonym using the hashes
        pseudonym = qualified_name(f"User?name={namehash}&email={mailhash}")
        # Map the old id to the pseudonym
        pseudonyms[agent.identifier] = pseudonym
        # Keep only prov role & prov type;
        # replace name & mail with hashes
        pseudonymized = pseudonymize_agent(
            agent,
            identifier=pseudonym,
            keep=[PROV_ROLE, PROV_TYPE],
            replace={USERNAME: namehash, USEREMAIL: mailhash},
        )
        # Add pseudonymized agent to the list of records
        records.append(pseudonymized)
    # Replace old id occurrences with the pseudonymized id
    for relation in graph.get_records(ProvRelation):
        formal = [(key, pseudonyms.get(val, val)) for key, val in relation.formal_attributes]
        extra = [(key, pseudonyms.get(val, val)) for key, val in relation.extra_attributes]
        r_type = PROV_REC_CLS.get(relation.get_type())
        records.append(r_type(relation.bundle, relation.identifier, formal + extra))
    return graph_factory(records)
def _make_sentence(self, tokens_left, tokens_right, seq_length=128):
    len_left = len(tokens_left)
    len_right = len(tokens_right)
    cut_len = len_left + len_right - (seq_length - 1)
    if cut_len > 0:
        cut_left = len_left - seq_length // 2
        cut_right = len_right - (seq_length - 1) // 2
        if cut_left < 0:
            cut_left, cut_right = 0, cut_left + cut_right
        elif cut_right < 0:
            cut_left, cut_right = cut_left + cut_right, 0
    else:
        cut_left, cut_right = 0, 0
    tokens_left = tokens_left[cut_left:]
    # tokens_right = tokens_right[:-cut_right] would fail when cut_right == 0
    tokens_right = tokens_right[:len(tokens_right) - cut_right]
    tokens = tokens_left + [self.bert_tokenizer.mask_token] + tokens_right
    attention_mask = [1] * len(tokens_left) + [1] + [1] * len(tokens_right)
    if len(tokens) < seq_length:
        num_padding = seq_length - len(tokens)
        tokens += [self.bert_tokenizer.pad_token] * num_padding
        attention_mask += [0] * num_padding  # was truncated to "num_paddi"
    return tokens, attention_mask  # assumed return; the snippet ended mid-function
def _random_word_context(self, text, max_trial=10):
    puncs = list("[]!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-")
    words = text.split()
    trial = 0
    done = False
    while trial < max_trial and not done:
        trial += 1
        w_idx = random.randint(0, len(words) - 1)
        word, left_res, right_res = words[w_idx], [], []
        # If the word is already in vocab, it's good to go.
        if len(word) >= self.min_word_len and \
                (word.lower() in self.dictionary) and \
                len(word) < DEFAULT_MAX_CHARACTER_POSITIONS - 4:
            done = True
        else:
            # Otherwise, detach puncs at the first and the last char, and check again
            if word[0] in puncs:
                word, left_res = word[1:], [word[0]]
            else:
                word, left_res = word, []
            if not word:
                continue  # The word was just a punc
            if word[-1] in puncs:
                word, right_res = word[:-1], [word[-1]]
            else:
                word, right_res = word, []
            if len(word) < self.min_word_len or \
                    (not word.lower() in self.dictionary) or \
                    len(word) >= DEFAULT_MAX_CHARACTER_POSITIONS - 4:
                continue
            # Check whether it's an anonymized field
            right_snip = ' '.join(words[w_idx + 1:w_idx + 5])
            if '**]' in right_snip and '[**' not in right_snip:
                continue
            left_snip = ' '.join(words[w_idx - 4:w_idx])
            if '[**' in left_snip and '**]' not in left_snip:
                continue
            # Pass!
            done = True
    if done:
        return word, ' '.join(words[:w_idx] + left_res), ' '.join(right_res + words[w_idx + 1:])
    else:
        raise ValueError('failed to choose word')
def __next__(self):
    # Select next note (length >= 2000)
    while True:
        try:
            _, row = next(self.note_iterrows)
        except StopIteration:
            self._load_random_csv()
            _, row = next(self.note_iterrows)
        note_id = int(row.ROW_ID)
        note = row.TEXT.strip()
        if len(note) < 2000:
            continue
        try:
            correct, left, right = self._random_word_context(note)
        except Exception:
            # import traceback; traceback.print_exc()
            continue
        break
    # Corrupt and pseudonymize
    correct = correct.lower()
    if random.uniform(0, 1) >= self.no_corruption_prob:
        typo = self.word_corrupter.corrupt_word(correct)
    else:
        typo = correct
    left = self.mimic_pseudo.pseudonymize(left)
    left = self._process_note(left)
    left = ' '.join(left.split(' ')[-128:])
    right = self.mimic_pseudo.pseudonymize(right)
    right = self._process_note(right)
    right = ' '.join(right.split(' ')[:128])
    # Parse
    temp_csv_row = [-1, note_id, typo, left, right, correct]
    # print(f'{self.csv_fname}({note_id}, {_}/{len(self.df_note)}): {correct} -> {typo}')
    example = self._parse_row(temp_csv_row)
    return example
def pseudonymize_8(self, s):
    # Integer division so the repeat count is an int on Python 3
    return struct.unpack(">" + ("I" * (len(s) // self.__stride)), s)
def pseudonymize(field):
    return sha256(field.encode() + salt.encode()).hexdigest()[:16]
def pseudonymize(
    self,
    original_text: str,
    presidio_response: List[RecognizerResult],
    count: int,
):
    """
    :param original_text: str containing the original text
    :param presidio_response: list of results from Presidio, to be used to know where entities are
    :param count: number of perturbations to return
    :return: List[str] with fake perturbations of original text
    """
    presidio_response = sorted(presidio_response, key=lambda resp: resp.start)
    anonymizer_engine = AnonymizerEngine()
    anonymized_result = anonymizer_engine.anonymize(
        text=original_text, analyzer_results=presidio_response
    )
    templated_text = anonymized_result.text
    templated_text = templated_text.replace(">", "}}").replace("<", "{{")
    fake_texts = [self.parse(templated_text, add_spans=False) for _ in range(count)]
    return fake_texts
def pseudonymize(
    self, key_file: KeyFile, identifiers: List["Identifier"]
) -> List["Key"]:
    """Get a pseudonym for each identifier. If an identifier is known in PIMS,
    return this. Otherwise, have PIMS generate a new pseudonym and return that.
    Parameters
    ----------
    identifiers: List[Identifier]
        The identifiers to get pseudonyms for
    key_file: KeyFile
        The key_file to use
    Notes
    -----
    Each call to this function calls the PIMS API twice for each unique source
    in identifiers. This is a result of the way the API can be called.
    Returns
    -------
    List[Key]
        The PIMS pseudonym for each identifier
    """
    keys = []
    # Each call to process a list of identifiers only allows a single source.
    # Split identifiers by source
    per_source = defaultdict(list)
    for x in identifiers:
        per_source[x.source].append(x)
    for source, items in per_source.items():
        keys = keys + self.deidentify(key_file, [x.value for x in items], source)
    return keys
def pseudonymize(self, s):
    sl = len(s) // 2  # integer division for Python 3
    return struct.unpack('<%dh' % sl, s)
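# Editorial note (an assumption from reading the code): pseudonymize_4,
# pseudonymize_8 and the method above are byte-reinterpretation helpers rather
# than true pseudonymizers; they unpack a byte string into tuples of
# little-endian int16 / big-endian uint32 values:
import struct

assert struct.unpack('<%dh' % (len(b'\x01\x00\x02\x00') // 2),
                     b'\x01\x00\x02\x00') == (1, 2)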
def regex_anonymizer(self, text: str, regex: Pattern, provider: str) -> str:
    '''
    Anonymize all substrings matching a specific regex using a Faker provider
    '''
    matches = re.findall(regex, text)
    return self.replace_all(text, matches, provider)
def psdnmyz_2():
    # Load TWO csv files to be sent to be pseudonymized
    # metrics_df = pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_METRICS_Skel_header.csv')
    seg_df = pd.read_csv('/home/arasan/testrep/psmd/jureca/psmd_seg_vols.csv')
    # Add a random id column to both df
    # (the line below is a disaster)
    # metrics_df['RNDNAME'] = metrics_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
    # seg_df['RNDNAME'] = seg_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
    # a = np.random.randint(100000, 999999, metrics_df.NAME.values.size)
    # metrics_df['RNDNAME'] = a
    # print 'after random id has been added'
    # flagg = True
    # while flagg:
    #     try:
    #         print pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1)
    #     except ValueError:
    #         print 'NO DUPLICATES'
    #         metrics_df.to_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
    #         flagg = False
    #     else:
    #         print 'DUPES'
    #         metrics_df = metrics_df.drop('RNDNAME', axis=1)
    #         a = np.random.randint(100000, 999999, metrics_df.NAME.values.size)
    #         metrics_df['RNDNAME'] = a
    # Load the double-checked randomized df: 1) try/except above 2) using np.unique
    metrnd = pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
    seg_df['SNO'] = seg_df.index + 1
    metrnd['SNO'] = seg_df.index + 1
    # Add RNDNAME column to seg_df
    seg_df['RNDNAME'] = metrnd.RNDNAME.values
    # Rename columns NAME to ID and RNDNAME to NAME
    seg_df = seg_df.rename(index=str, columns={"NAME": "ID"})
    seg_df = seg_df.rename(index=str, columns={"RNDNAME": "NAME"})
    metrnd = metrnd.rename(index=str, columns={"NAME": "ID"})
    metrnd = metrnd.rename(index=str, columns={"RNDNAME": "NAME"})
    # Dump the map out with 3 columns: ID, NAME, SNO
    mapdf = metrnd[['ID', 'NAME', 'SNO']]
    mapdf.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/psdnmyz_map.csv', index=False)
    # Drop ID and SNO
    seg_df = seg_df.drop(['ID', 'SNO'], axis=1)
    metrnd = metrnd.drop(['ID', 'SNO'], axis=1)
    # Move NAME column to first position
    metrnd = metrnd[['NAME', 'mean_skel_MD_LH_RH', 'sd_skel_MD_LH_RH', 'Pw90S_skel_MD_LH_RH', 'mean_skel_FA_LH_RH',
                     'sd_skel_FA_LH_RH', 'mean_skel_AD_LH_RH', 'sd_skel_AD_LH_RH', 'mean_skel_RD_LH_RH',
                     'sd_skel_RD_LH_RH']]
    seg_df = seg_df[['NAME', 'AGE', 'SEX', 'GMV', 'WMV', 'CSFV', 'ICV']]
    # if pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1).RNDNAME.values.size:
    #     print 'NOT OK'
    # else:
    #     print 'OK'
    metrnd.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/TOTAL_METRICS_Skel_header.csv', index=False)
    seg_df.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/psmd_seg_vols.csv', index=False)
def psdnmyz_3():
    # Load TWO csv files to be sent to be pseudonymized
    # metrics_df = pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_METRICS_Skel_header.csv')
    seg_df = pd.read_csv('/home/arasan/testrep/psmd/jureca/psmd_seg2_vols.csv')
    # Add a random id column to both df
    # (the line below is a disaster)
    # metrics_df['RNDNAME'] = metrics_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
    # seg_df['RNDNAME'] = seg_df['NAME'].apply(lambda x: gocept.pseudonymize.integer(x, 'secret'))
    # a = np.random.randint(100000, 999999, metrics_df.NAME.values.size)
    # metrics_df['RNDNAME'] = a
    # print 'after random id has been added'
    # flagg = True
    # while flagg:
    #     try:
    #         print pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1)
    #     except ValueError:
    #         print 'NO DUPLICATES'
    #         metrics_df.to_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
    #         flagg = False
    #     else:
    #         print 'DUPES'
    #         metrics_df = metrics_df.drop('RNDNAME', axis=1)
    #         a = np.random.randint(100000, 999999, metrics_df.NAME.values.size)
    #         metrics_df['RNDNAME'] = a
    # Load the double-checked randomized df: 1) try/except above 2) using np.unique
    metrnd = pd.read_csv('/home/arasan/testrep/psmd/jureca/TOTAL_rnd_temp.csv')
    seg_df['SNO'] = seg_df.index + 1
    # metrnd['SNO'] = seg_df.index + 1
    # Add RNDNAME column to seg_df
    seg_df['RNDNAME'] = metrnd.RNDNAME.values
    # Rename columns NAME to ID and RNDNAME to NAME
    # seg_df = seg_df.rename(index=str, columns={"NAME": "ID"})
    seg_df = seg_df.rename(index=str, columns={"RNDNAME": "NAME"})
    # metrnd = metrnd.rename(index=str, columns={"NAME": "ID"})
    # metrnd = metrnd.rename(index=str, columns={"RNDNAME": "NAME"})
    # Dump the map out with 3 columns: ID, NAME, SNO
    # mapdf = metrnd[['ID', 'NAME', 'SNO']]
    # mapdf.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/psdnmyz_map.csv', index=False)
    # Drop ID and SNO
    seg_df = seg_df.drop(['ID', 'SNO'], axis=1)
    # metrnd = metrnd.drop(['ID', 'SNO'], axis=1)
    # Move NAME column to first position
    # metrnd = metrnd[['NAME', 'mean_skel_MD_LH_RH', 'sd_skel_MD_LH_RH', 'Pw90S_skel_MD_LH_RH', 'mean_skel_FA_LH_RH', 'sd_skel_FA_LH_RH', 'mean_skel_AD_LH_RH', 'sd_skel_AD_LH_RH', 'mean_skel_RD_LH_RH', 'sd_skel_RD_LH_RH']]
    seg_df = seg_df[['NAME', 'AGE', 'SEX', 'ICV']]
    # if pd.concat(g for _, g in metrics_df.groupby("RNDNAME") if len(g) > 1).RNDNAME.values.size:
    #     print 'NOT OK'
    # else:
    #     print 'OK'
    # metrnd.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet2/TOTAL_METRICS_Skel_header.csv', index=False)
    seg_df.to_csv('/home/arasan/testrep/psmd/jureca/bordeaux_packet3/psmd_seg2_vols.csv', index=False)
def hashPseudonym(self, i, key, tile):
    digest = hashes.Hash(hashes.SHA256(), default_backend())
    # for i in range(0, len(plainTail)):
    _digest = digest.copy()
    # key = secrets.token_bytes(32)
    _digest.update(bytes(i))
    _digest.update(key)
    _digest.update(bytes(tile))
    p = _digest.finalize()
    # digest.finalize()
    return p
def test_localization_of_pseudonym(self):
    name = b" a 16 byte name "
    target = b"PEP3 storage_facility"
    pp = pep3_pb2.Pseudonymizable(data=name,
                                  state=pep3_pb2.Pseudonymizable.UNENCRYPTED_NAME)
    self.collector.pseudonymize([pp])
    self.collector.relocalize([pp],
                              self.config.collector.warrants.to_sf)
    sfp = elgamal.Triple.unpack(pp.data) \
        .decrypt(self.sf.private_keys['pseudonym'])
    pseudonym_secrets = {}
    for peer_secrets in self.secrets.peers.values():
        for shard, shard_secrets in peer_secrets.by_shard.items():
            pseudonym_secrets[shard] \
                = shard_secrets.pseudonym_component_secret
    s = 1
    e = ed25519.scalar_unpack(common.sha256(target))
    for secret in pseudonym_secrets.values():
        s *= pow(ed25519.scalar_unpack(secret), e, ed25519.l)
        s %= ed25519.l
    self.assertEqual(
        sfp * ed25519.scalar_inv(s),
        ed25519.Point.lizard(name))
def test_store_and_retrieve(self):
    # First store a record with random source and target ip addresses,
    # and see if we can recover it.
    col_request = pep3_pb2.StoreRequest()
    col_request.id = os.urandom(16)
    flowrecord = col_request.records.add()
    flowrecord.source_ip.data = os.urandom(16)
    flowrecord.source_ip.state = pep3_pb2.Pseudonymizable.UNENCRYPTED_NAME
    flowrecord.destination_ip.data = os.urandom(16)
    flowrecord.destination_ip.state = \
        pep3_pb2.Pseudonymizable.UNENCRYPTED_NAME
    flowrecord.anonymous_part.number_of_bytes = 123
    flowrecord.anonymous_part.number_of_packets = 456
    updates = list(self.collector.connect_to('collector').Store(
        iter([col_request])))
    self.assertEqual(len(updates), 1)
    self.assertEqual(updates[0].stored_id, col_request.id)
    # Store the same flowrecord twice, to see if that causes trouble
    col_request.id = os.urandom(16)
    updates = list(self.collector.connect_to('collector').Store(
        iter([col_request])))
    self.assertEqual(len(updates), 1)
    self.assertEqual(updates[0].stored_id, col_request.id)
    query = pep3_pb2.SqlQuery()
    # Manually compute storage_facility-local pseudonyms for the query
    sf_name = b"PEP3 storage_facility"
    pseudonym_secrets = {}
    for peer_secrets in self.secrets.peers.values():
        for shard, shard_secrets in peer_secrets.by_shard.items():
            pseudonym_secrets[shard] \
                = shard_secrets.pseudonym_component_secret
    s = 1
    e = ed25519.scalar_unpack(common.sha256(sf_name))
    for secret in pseudonym_secrets.values():
        s *= pow(ed25519.scalar_unpack(secret), e, ed25519.l)
        s %= ed25519.l
    # See if the record was stored correctly by querying the
    # database directly.
    query.query = """SELECT peped_flows.p_dst_ip FROM peped_flows
            WHERE peped_flows.p_src_ip=:ip"""
    ip = query.parameters['ip'].pseudonymizable_value
    ip.data = (ed25519.Point.lizard(
        flowrecord.source_ip.data) * s).pack()
    ip.state = pep3_pb2.Pseudonymizable.UNENCRYPTED_PSEUDONYM
    row = self.sf.connect_to('database') \
        .Query(query).next().rows[0]
    self.assertEqual(row.cells[0].pseudonymizable_value.data,
                     (ed25519.Point.lizard(flowrecord.destination_ip.data) * s
                      ).pack())
    # Manually compute researcher-local pseudonyms for the query
    researcher_name = b"PEP3 researcher"
    pseudonym_secrets = {}
    for peer_secrets in self.secrets.peers.values():
        for shard, shard_secrets in peer_secrets.by_shard.items():
            pseudonym_secrets[shard] \
                = shard_secrets.pseudonym_component_secret
    s = 1
    e = ed25519.scalar_unpack(common.sha256(researcher_name))
    for secret in pseudonym_secrets.values():
        s *= pow(ed25519.scalar_unpack(secret), e, ed25519.l)
        s %= ed25519.l
    # Now query via the researcher
    query.parameters['ip'].pseudonymizable_value.data \
        = (ed25519.Point.lizard(flowrecord.source_ip.data) * s).pack()
    row = self.researcher.connect_to('researcher') \
        .Query(query).next().rows[0]
    self.assertEqual(row.cells[0].pseudonymizable_value.data,
                     (ed25519.Point.lizard(flowrecord.destination_ip.data) * s
                      ).pack())
def test_depseudonymize(self):
    ip = os.urandom(16)
    # Manually compute the investigator-local pseudonym
    pseudonym_secrets = {}
    for peer_secrets in self.secrets.peers.values():
        for shard, shard_secrets in peer_secrets.by_shard.items():
            pseudonym_secrets[shard] \
                = shard_secrets.pseudonym_component_secret
    s = 1
    e = ed25519.scalar_unpack(common.sha256(b"PEP3 investigator"))
    for secret in pseudonym_secrets.values():
        s *= pow(ed25519.scalar_unpack(secret), e, ed25519.l)
        s %= ed25519.l
    investigator_local_ip = (ed25519.Point.lizard(ip) * s).pack()
    # Manually create the warrant
    warrant = pep3_pb2.DepseudonymizationRequest.Warrant()
    warrant.act.actor = b"PEP3 investigator"
    warrant.act.name.state = pep3_pb2.Pseudonymizable.UNENCRYPTED_PSEUDONYM
    warrant.act.name.data = investigator_local_ip
    self.investigator.encrypt([warrant.act.name],
                              self.investigator.public_keys['pseudonym'])
    warrant.signature = crypto.sign(
        crypto.load_privatekey(crypto.FILETYPE_PEM,
                               self.secrets.root_certificate_keys.warrants),
        warrant.act.SerializeToString(), 'sha256')
    result = self.investigator.connect_to("investigator") \
        .Depseudonymize(warrant)
    self.assertEqual(result.data, ip)
def anonymize(cls, user, ldap_attrs, **kwargs):
    # type: (User, Dict[AnyStr, Any], **Any) -> Dict[AnyStr, AnyStr]
    """
    Change values of function arguments to anonymize/pseudonymize user if
    UCRV asm/attributes/<staff/student>/anonymize is true. Will return
    unchanged function arguments otherwise.
    :param User user: user object
    :param dict ldap_attrs: dictionary with the user's LDAP attributes
    :return: dictionary with [modified] function arguments
    :rtype: dict
    :raises NotImplementedError: if cls.ucr_anonymize_key_base is unset
    """
    ucr = get_ucr()
    if ucr.is_true(cls.ucr_anonymize_key_base):
        for k, v in cls.anonymize_mapping().items():
            if v and v.startswith('%'):
                attr = v[1:].strip()
                try:
                    v = ldap_attrs[attr][0]
                except KeyError:
                    raise ValueError('Attribute {!r} not found in LDAP object of {}.'.format(attr, user))
                except IndexError:
                    raise ValueError('Attribute {!r} empty in LDAP object of {}.'.format(attr, user))
            kwargs[k] = v
    return kwargs
def _modify_dataset(
    self,
    anonymizer: Anonymizer,
    pseudonym: str,
    ds: Dataset,
) -> None:
    """Optionally pseudonymize an incoming dataset with the given pseudonym
    and add the trial ID and name to the DICOM header if specified."""
    if pseudonym:
        # All dates get pseudonymized, but we want to retain the study date.
        study_date = ds.StudyDate
        anonymizer.anonymize(ds)
        ds.StudyDate = study_date
        ds.PatientID = pseudonym
        ds.PatientName = pseudonym
    # A stray trailing comma here previously turned the ID into a one-element tuple.
    trial_protocol_id = self.transfer_task.job.trial_protocol_id
    trial_protocol_name = self.transfer_task.job.trial_protocol_name
    if trial_protocol_id:
        ds.ClinicalTrialProtocolID = trial_protocol_id
    if trial_protocol_name:
        ds.ClinicalTrialProtocolName = trial_protocol_name
    if pseudonym and trial_protocol_id:
        session_id = f"{ds.StudyDate}-{ds.StudyTime}"
        ds.PatientComments = f"Project:{trial_protocol_id} Subject:{pseudonym} Session:{pseudonym}_{session_id}"
def _psc1(psc1, psc2_from_psc1):
    if 'TEST' in psc1.upper():
        # Skip test subjects
        logging.debug('skipping test subject "%s"', psc1)
    else:
        # Find and skip subjects with an invalid identifier
        if psc1[-3:] in {'FU2', 'FU3'}:
            psc1 = psc1[:-3]
        elif psc1[-2:] == 'SB':
            psc1 = psc1[:-2]
        if psc1 in psc2_from_psc1:
            return psc1
        elif psc1 in {'0x0000xxxxxx'}:
            logging.info('skipping known invalid subject identifier "%s"',
                         psc1)
        else:
            logging.error('invalid subject identifier "%s"', psc1)
    return None
def pseudonymize_node_name(name):
    """Replace Node.Name (detector ID) by a hash with secret key"""
    h = hashlib.md5((app.secret_key + name).encode('utf-8'))
    return 'node.' + h.hexdigest()[:6]
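# Usage sketch (assumes a Flask-style `app` object with a secret_key; not part
# of the original snippet):
# app.secret_key = 'example-secret'
# pseudonymize_node_name('detector-42') -> 'node.' + first 6 hex chars of the
# keyed MD5 digest, e.g. 'node.a1b2c3'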
def pseudonymize(self, size=None):
    """
    Return pseudonymized values for this attribute, which is used to
    substitute identifiable data with a reversible, consistent value.
    """
    size = size or self.size
    if size != self.size:
        attr = Series(np.random.choice(self.bins, size=size, p=self.prs))
    else:
        attr = self
    if self.categorical:
        mapping = {b: utils.pseudonymise_string(b) for b in self.bins}
        return attr.map(lambda x: mapping[x])
    if self.type == 'string':
        return attr.map(utils.pseudonymise_string)
    elif self.is_numerical or self.type == 'datetime':
        return attr.map(str).map(utils.pseudonymise_string)
def pseudonymize(self, content):
    if not content:
        return content
    content_modified = ''
    start = 0
    # Raw string avoids invalid escape sequences in the pattern
    for mo in re.finditer(r"\[\*\*[^\[]*\*\*\]", content):
        replacement = self.mapper.get_mapping(mo.group(0))
        content_modified += content[start: mo.start()]
        content_modified += replacement
        start = mo.end()
    if start < len(content):
        content_modified += content[start: len(content)]
    return content_modified
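# Usage sketch (assumption: `self.mapper` maps MIMIC-style de-identification
# placeholders to surrogate values):
#   pseudonymize(self, 'Seen by [**First Name**] on [**2101-3-4**].')
# returns the text with each '[** ... **]' span replaced by
# self.mapper.get_mapping(...), leaving the surrounding prose untouched.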