nlp.py

import torch
from transformers import BertModel, BertTokenizer
import numpy as np


def get_word_vec(word):
    # Use the bert-base model; its vocabulary is lowercased
    model_name = 'bert-base-uncased'
    # Load the tokenizer that matches the model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Load the model
    model = BertModel.from_pretrained(model_name)
    # Input text
    input_text = word
    # Turn each input into token ids with the tokenizer
    input_ids = torch.tensor([tokenizer.encode(input_text_i) for input_text_i in input_text])
    # Unfinished padding stub (see the sketch after get_cos_similar):
    # max_len = 10
    # while len(input_ids) < max_len:
    #     input_ids.
    print(input_ids)
    # input_ids: [101, 2182, 2003, 2070, 3793, 2000, 4372, 16044, 102]
    # input_ids = torch.tensor([input_ids])
    # Get the last hidden layer of the BERT model
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # model outputs are tuples; [0] is the last hidden state
        # print(model(input_ids))
    print(last_hidden_states)
    print(last_hidden_states.shape)
    """ tensor([[[-0.0549,  0.1053, -0.1065,  ..., -0.3550,  0.0686,  0.6506],
                 [-0.5759, -0.3650, -0.1383,  ..., -0.6782,  0.2092, -0.1639],
                 [-0.1641, -0.5597,  0.0150,  ..., -0.1603, -0.1346,  0.6216],
                 ...,
                 [ 0.2448,  0.1254,  0.1587,  ..., -0.2749, -0.1163,  0.8809],
                 [ 0.0481,  0.4950, -0.2827,  ..., -0.6097, -0.1212,  0.2527],
                 [ 0.9046,  0.2137, -0.5897,  ...,  0.3040, -0.6172, -0.1950]]])
        shape: (1, 9, 768)
    """
    return last_hidden_states


def get_cos_similar(v1, v2):
    num = float(np.dot(v1, v2))                       # dot product of the two vectors
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)   # product of their norms
    # cosine similarity rescaled from [-1, 1] to [0, 1]
    return 0.5 + 0.5 * (num / denom) if denom != 0 else 0
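

# --- Sketch (added; not part of the original script) --------------------------
# The commented-out while loop in get_word_vec hints at padding every input to
# max_len by hand. Assuming a reasonably recent transformers release, the
# tokenizer can pad pre-tokenized inputs itself; the function below is a
# hedged sketch of that idea, not the author's implementation.
def get_word_vec_padded(words, max_len=10):
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    # is_split_into_words marks the inputs as word lists; padding/truncation
    # force every row to exactly max_len token ids.
    encoded = tokenizer(words,
                        is_split_into_words=True,
                        padding='max_length',
                        truncation=True,
                        max_length=max_len,
                        return_tensors='pt')
    with torch.no_grad():
        # pass the attention mask so padded positions are ignored
        out = model(encoded['input_ids'],
                    attention_mask=encoded['attention_mask'])
    return out[0]  # shape: (batch, max_len, 768)
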
if __name__ == '__main__':
    vecs = get_word_vec([["psw", "name"], ["git", "name"]])
    # vecs[i][0] is the vector of the first token of the i-th input
    print(get_cos_similar(vecs[0][0], vecs[1][0]))
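
# --- Note (added) --------------------------------------------------------------
# vecs[0][0] and vecs[1][0] above are the 768-d vectors of the first token
# ([CLS], since tokenizer.encode adds the special tokens) of each input.
# Assuming the tensors stay in torch, the raw cosine (before the
# 0.5 + 0.5 * ... rescaling in get_cos_similar) can also be computed without
# the numpy round trip:
#
#     torch.nn.functional.cosine_similarity(vecs[0][0], vecs[1][0], dim=0)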