nlp.py

import torch
from transformers import BertModel, BertTokenizer
import numpy as np


def get_word_vec(word):
    # Use the bert-base model; its vocabulary is lowercased
    model_name = 'bert-base-uncased'
    # Load the tokenizer that matches the model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Load the model
    model = BertModel.from_pretrained(model_name)
    # Input text
    input_text = word
    # Turn each input into token ids with the tokenizer
    input_ids = torch.tensor([tokenizer.encode(input_text_i) for input_text_i in input_text])
    # Unfinished padding stub (see the sketch after get_cos_similar):
    # max_len = 10
    # while len(input_ids) < max_len:
    #     input_ids.
    print(input_ids)
    # input_ids: [101, 2182, 2003, 2070, 3793, 2000, 4372, 16044, 102]
    # input_ids = torch.tensor([input_ids])
    # Get the last hidden layer of the BERT model
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # model outputs are tuples; [0] is the last hidden state
        # print(model(input_ids))
    print(last_hidden_states)
    print(last_hidden_states.shape)
    """ tensor([[[-0.0549,  0.1053, -0.1065,  ..., -0.3550,  0.0686,  0.6506],
                 [-0.5759, -0.3650, -0.1383,  ..., -0.6782,  0.2092, -0.1639],
                 [-0.1641, -0.5597,  0.0150,  ..., -0.1603, -0.1346,  0.6216],
                 ...,
                 [ 0.2448,  0.1254,  0.1587,  ..., -0.2749, -0.1163,  0.8809],
                 [ 0.0481,  0.4950, -0.2827,  ..., -0.6097, -0.1212,  0.2527],
                 [ 0.9046,  0.2137, -0.5897,  ...,  0.3040, -0.6172, -0.1950]]])
        shape: (1, 9, 768)
    """
    return last_hidden_states


def get_cos_similar(v1, v2):
    num = float(np.dot(v1, v2))                       # dot product of the two vectors
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)   # product of their norms
    # cosine similarity rescaled from [-1, 1] to [0, 1]
    return 0.5 + 0.5 * (num / denom) if denom != 0 else 0
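

# --- Sketch (added; not part of the original script) --------------------------
# The commented-out while loop in get_word_vec hints at padding every input to
# max_len by hand. Assuming a reasonably recent transformers release, the
# tokenizer can pad pre-tokenized inputs itself; the function below is a
# hedged sketch of that idea, not the author's implementation.
def get_word_vec_padded(words, max_len=10):
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    # is_split_into_words marks the inputs as word lists; padding/truncation
    # force every row to exactly max_len token ids.
    encoded = tokenizer(words,
                        is_split_into_words=True,
                        padding='max_length',
                        truncation=True,
                        max_length=max_len,
                        return_tensors='pt')
    with torch.no_grad():
        # pass the attention mask so padded positions are ignored
        out = model(encoded['input_ids'],
                    attention_mask=encoded['attention_mask'])
    return out[0]  # shape: (batch, max_len, 768)
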
if __name__ == '__main__':
    vecs = get_word_vec([["psw", "name"], ["git", "name"]])
    # vecs[i][0] is the vector of the first token of the i-th input
    print(get_cos_similar(vecs[0][0], vecs[1][0]))
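
# --- Note (added) --------------------------------------------------------------
# vecs[0][0] and vecs[1][0] above are the 768-d vectors of the first token
# ([CLS], since tokenizer.encode adds the special tokens) of each input.
# Assuming the tensors stay in torch, the raw cosine (before the
# 0.5 + 0.5 * ... rescaling in get_cos_similar) can also be computed without
# the numpy round trip:
#
#     torch.nn.functional.cosine_similarity(vecs[0][0], vecs[1][0], dim=0)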