# Source repository: LiuFan / PrivacyScanData
# author: Justin Cui
# date: 2019/10/23
# email: 321923502@qq.com


from numpy import *


def load_data():
    """Return the built-in sample data: six shopping-basket transactions.

    Each transaction is a list of item-name strings. A new outer list (and
    new inner lists) is built on every call.
    """
    return [
        ['bread', 'milk', 'vegetable', 'fruit', 'eggs'],
        ['noodle', 'beef', 'pork', 'water', 'socks', 'gloves', 'shoes', 'rice'],
        ['socks', 'gloves'],
        ['bread', 'milk', 'shoes', 'socks', 'eggs'],
        ['socks', 'shoes', 'sweater', 'cap', 'milk', 'vegetable', 'gloves'],
        ['eggs', 'bread', 'milk', 'fish', 'crab', 'shrimp', 'rice'],
    ]


# Scan the whole data set and produce C1 (the candidate 1-itemsets).
def create_c1(data):
    """Build the candidate 1-itemsets from the transactions.

    Args:
        data: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list with one single-element frozenset per distinct item, ordered
        by the items' sort order.
    """
    # A set removes duplicates in one pass instead of a linear list search
    # for every item of every transaction.
    unique_items = {item for transaction in data for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]


# Derive L(i) (the frequent itemsets) from the candidates C(i).
def c2l(data, ck, min_support):
    """Keep the candidates whose support reaches ``min_support``.

    Args:
        data: sized collection of transactions (sets of items).
        ck: iterable of candidate itemsets (frozensets).
        min_support: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A ``(frequent, support_data)`` pair: the kept candidates in the order
        they were first matched while scanning ``data``, and a dict mapping
        each kept candidate to its support (matches / ``len(data)``).
        Candidates contained in no transaction never appear.
    """
    occurrences = {}
    for transaction in data:
        for candidate in ck:
            if not candidate.issubset(transaction):
                continue
            occurrences[candidate] = occurrences.get(candidate, 0) + 1

    support_data = {}
    for candidate, hits in occurrences.items():
        ratio = hits / len(data)
        if ratio >= min_support:
            support_data[candidate] = ratio
    # Dict keys iterate in insertion order, which is the order the kept
    # candidates were first counted.
    return list(support_data), support_data


# Generate the next candidate set C(k) from the frequent itemsets L(k-1).
def get_next_c(Lk, k):
    """Join frequent itemsets into the next, one-item-larger candidates.

    Two itemsets are joined when their first ``k`` items, taken in sorted
    order, are equal. The union is kept only if every subset obtained by
    dropping one item is itself in ``Lk`` (Apriori pruning).

    Args:
        Lk: list of frequent itemsets of equal size, with orderable items.
        k: length of the sorted prefix two itemsets must share. The caller
            in this file passes the itemset size minus one.

    Returns:
        List of candidate frozensets, in the order the joined pairs occur.
    """
    itemsets = [frozenset(itemset) for itemset in Lk]
    frequent = set(itemsets)
    # Sort before slicing: frozenset iteration order is unspecified, so an
    # unsorted prefix can miss valid joins or emit the same candidate twice.
    prefixes = [sorted(itemset)[:k] for itemset in itemsets]

    result_list = []
    count = len(itemsets)
    for i in range(count):
        for j in range(i + 1, count):
            if prefixes[i] != prefixes[j]:
                continue
            candidate = itemsets[i] | itemsets[j]
            # Explicit loop rather than all(): the file's
            # `from numpy import *` rebinds `all` to numpy.all, which does
            # not consume a generator.
            for item in candidate:
                if candidate - {item} not in frequent:
                    break
            else:
                result_list.append(candidate)
    return result_list


# Collect every level of frequent itemsets (all the L sets).
def get_all_l(data_set, min_support):
    """Run the level-wise search for frequent itemsets.

    Args:
        data_set: list of transactions, each a list of items.
        min_support: minimum support passed to ``c2l`` at every level.

    Returns:
        A ``(L, support_data)`` pair. ``L[i]`` is the list of frequent
        itemsets found at level ``i`` (the first level comes from
        ``create_c1``); the final empty level is not included.
        ``support_data`` maps every frequent itemset to its support.
    """
    first_candidates = create_c1(data_set)
    transactions = [set(row) for row in data_set]
    current, support_data = c2l(transactions, first_candidates, min_support)

    levels = []
    prefix_len = 0
    # Stop as soon as a level yields no frequent itemsets; that empty level
    # is never stored.
    while len(current) > 0:
        levels.append(current)
        candidates = get_next_c(current, prefix_len)
        current, level_support = c2l(transactions, candidates, min_support)
        support_data.update(level_support)
        prefix_len += 1
    return levels, support_data


# Collect the subsets of one frequent itemset.
def get_subset(from_list, result_list):
    """Append the subsets of ``from_list`` to ``result_list`` in place.

    Each item is dropped in turn; the remaining items are appended as a
    frozenset unless already present, and subsets with more than one item
    are expanded recursively. For two or more distinct items this yields
    every non-empty proper subset; for a single item it yields only the
    empty frozenset.

    Args:
        from_list: list of hashable items.
        result_list: list that receives the frozensets; entries already in
            it are neither re-added nor expanded again.

    Returns:
        None.
    """
    members = set(from_list)
    for dropped in from_list:
        remainder = frozenset(members - set([dropped]))
        if remainder in result_list:
            continue
        result_list.append(remainder)
        if len(remainder) > 1:
            get_subset(list(remainder), result_list)


# Compute the confidence of each rule.
def calc_conf(freqSet, H, supportData, min_conf):
    """Print the rules of ``freqSet`` that pass the confidence and lift tests.

    For every consequent in ``H`` the antecedent is ``freqSet - conseq``.
    A rule is printed when its confidence is at least ``min_conf`` and its
    lift is strictly greater than 1.

    Args:
        freqSet: frequent itemset (frozenset) the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping itemsets to support; it must hold
            ``freqSet``, each consequent and each antecedent.
        min_conf: minimum confidence for a rule to be printed.

    Returns:
        None; the rules are written to standard output.
    """
    for conseq in H:
        antecedent = freqSet - conseq
        joint_support = supportData[freqSet]
        antecedent_support = supportData[antecedent]
        conf = joint_support / antecedent_support
        lift = joint_support / (supportData[conseq] * antecedent_support)
        if not (conf >= min_conf and lift > 1):
            continue
        # The value shown after the support label is the antecedent's
        # support, not the support of the whole itemset.
        print(set(antecedent), '-->', set(conseq), '支持度', round(antecedent_support, 2), '置信度：',
              conf)


# Generate the association rules.
def gen_rule(L, support_data, min_conf=0.7):
    """Print the frequent itemsets per level, then the rules derived from them.

    Args:
        L: list of levels, each a list of frequent itemsets (frozensets).
        support_data: dict mapping itemsets to support, passed on to
            ``calc_conf``.
        min_conf: minimum confidence passed on to ``calc_conf``.

    Returns:
        None; everything is written to standard output.
    """
    for level_no, itemsets in enumerate(L, start=1):
        print("\n", level_no, "-频繁项集为：")
        for freqSet in itemsets:
            print(set(freqSet), end="  ")
    print("\n")

    # The first level is skipped when deriving rules.
    for itemsets in L[1:]:
        for freqSet in itemsets:
            all_subset = []
            get_subset(list(freqSet), all_subset)
            calc_conf(freqSet, all_subset, support_data, min_conf)


if __name__ == '__main__':
    # Demo run on the built-in sample: mine itemsets with minimum support
    # 0.5, then print the rules that reach confidence 0.6.
    frequent_levels, support_by_itemset = get_all_l(load_data(), 0.5)
    gen_rule(frequent_levels, support_by_itemset, 0.6)