博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
【笔记3】用pandas实现矩阵数据格式的推荐算法 (基于用户的协同过滤)
阅读量:7066 次
发布时间:2019-06-28

本文共 6213 字,大约阅读时间需要 20 分钟。

原书作者使用字典dict实现推荐算法,并且惊叹于18行代码实现了向量的余弦夹角公式。

我用pandas实现相同的公式只要3行。

特别说明:本篇笔记是针对矩阵数据,下篇笔记是针对条目数据。

"""User-based collaborative filtering on a user-by-item rating matrix.

The ratings live in a module-level DataFrame ``df`` whose index is the user
name and whose columns are the items (bands).  Missing ratings are NaN.
"""
import json
from io import StringIO

import pandas as pd

# Data format 1: CSV matrix (user x item); suitable for small data sets.
csv_txt = '''"user","Blues Traveler","Broken Bells","Deadmau5","Norah Jones","Phoenix","Slightly Stoopid","The Strokes","Vampire Weekend"
"Angelica",3.5,2.0,,4.5,5.0,1.5,2.5,2.0
"Bill",2.0,3.5,4.0,,2.0,3.5,,3.0
"Chan",5.0,1.0,1.0,3.0,5,1.0,,
"Dan",3.0,4.0,4.5,,3.0,4.5,4.0,2.0
"Hailey",,4.0,1.0,4.0,,,4.0,1.0
"Jordyn",,4.5,4.0,5.0,5.0,4.5,4.0,4.0
"Sam",5.0,2.0,,3.0,5.0,4.0,5.0,
"Veronica",3.0,,,5.0,4.0,2.5,3.0,
'''

# Data format 2: JSON mapping of user -> {item: rating}.
json_txt = '''{"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                      "Norah Jones": 4.5, "Phoenix": 5.0,
                      "Slightly Stoopid": 1.5,
                      "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill": {"Blues Traveler": 2.0, "Broken Bells": 3.5,
                  "Deadmau5": 4.0, "Phoenix": 2.0,
                  "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                  "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
                  "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                 "Deadmau5": 4.5, "Phoenix": 3.0,
                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                    "Norah Jones": 4.0, "The Strokes": 4.0,
                    "Vampire Weekend": 1.0},
         "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,
                    "Norah Jones": 5.0, "Phoenix": 5.0,
                    "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                    "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                 "Norah Jones": 3.0, "Phoenix": 5.0,
                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0,
                      "Norah Jones": 5.0,
                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,
                      "The Strokes": 3.0}}'''

# Rating matrix shared by every function below; set by one of the loaders.
df = None


def load_csv_txt():
    """Load ``csv_txt`` into the global rating matrix ``df`` (users as index)."""
    global df
    df = pd.read_csv(StringIO(csv_txt), header=0, index_col="user")


def load_json_txt():
    """Load ``json_txt`` into the global rating matrix ``df`` (users as index)."""
    global df
    # Wrap the text in StringIO: passing literal JSON to read_json is deprecated.
    df = pd.read_json(StringIO(json_txt), orient='index')


# Populate the matrix at import time so the functions below are usable.
load_csv_txt()
# load_json_txt()


def build_xy(user_name1, user_name2):
    """Return the two users' ratings restricted to items both have rated.

    Returns a pair of pd.Series (x, y) sharing the same item index.
    """
    rated_by_both = df.loc[user_name1].notnull() & df.loc[user_name2].notnull()
    return df.loc[user_name1, rated_by_both], df.loc[user_name2, rated_by_both]


def manhattan(user_name1, user_name2):
    """Manhattan distance over the items both users rated."""
    x, y = build_xy(user_name1, user_name2)
    return (x - y).abs().sum()


def euclidean(user_name1, user_name2):
    """Euclidean distance over the items both users rated."""
    x, y = build_xy(user_name1, user_name2)
    return ((x - y) ** 2).sum() ** 0.5


def minkowski(user_name1, user_name2, r):
    """Minkowski distance of order ``r`` over the items both users rated."""
    x, y = build_xy(user_name1, user_name2)
    return ((x - y).abs() ** r).sum() ** (1 / r)


def pearson(user_name1, user_name2):
    """Pearson correlation over the items both users rated.

    Returns 0.0 when the correlation is undefined (zero variance on either
    side, or no items in common).
    """
    x, y = build_xy(user_name1, user_name2)
    dx, dy = x - x.mean(), y - y.mean()
    denominator = ((dx ** 2).sum() * (dy ** 2).sum()) ** 0.5
    # Check before dividing; the division must not run on a zero denominator.
    if denominator == 0:
        return 0.0
    return (dx * dy).sum() / denominator


def cosine(user_name1, user_name2):
    """Cosine similarity over the items both users rated.

    Returns 0.0 when either vector has zero length.
    """
    x, y = build_xy(user_name1, user_name2)
    denominator = ((x * x).sum() * (y * y).sum()) ** 0.5
    # Check before dividing; the division must not run on a zero denominator.
    if denominator == 0:
        return 0.0
    return (x * y).sum() / denominator


metric_funcs = {
    'manhattan': manhattan,
    'euclidean': euclidean,
    'minkowski': minkowski,
    'pearson': pearson,
    'cosine': cosine,
}

# Metrics where a smaller value means "more similar"; the rest are similarities.
_DISTANCE_METRICS = ('manhattan', 'euclidean', 'minkowski')


def computeNearestNeighbor(user_name, metric='pearson', k=3, r=2):
    """Return the ``k`` users closest to ``user_name``.

    metric: key of ``metric_funcs`` selecting the measure
    k:      number of neighbours to return
    r:      order of the Minkowski distance (used by 'minkowski' only)

    Returns a pd.Series whose index is the neighbour name and whose values
    are the metric values, best match first.
    Raises ValueError for an unknown metric.
    """
    if metric not in metric_funcs:
        raise ValueError(
            "unknown metric %r; expected one of %s" % (metric, sorted(metric_funcs))
        )

    others = df.drop(user_name).index.to_series()
    extra_args = (user_name, r) if metric == 'minkowski' else (user_name,)
    scores = others.apply(metric_funcs[metric], args=extra_args)

    if metric in _DISTANCE_METRICS:
        return scores.nsmallest(k)
    return scores.nlargest(k)


def recommend(user_name):
    """Recommend items rated by the single nearest neighbour (default metric).

    Returns a pd.Series (index: item name, values: the neighbour's rating)
    of items the neighbour rated but ``user_name`` did not, sorted ascending.
    """
    nearest_username = computeNearestNeighbor(user_name).index[0]
    unseen = df.loc[user_name].isnull() & df.loc[nearest_username].notnull()
    return df.loc[nearest_username, unseen].sort_values()


def recommend2(user_name, metric='pearson', k=3, n=5, r=2):
    """Recommend items using the weighted ratings of the ``k`` nearest users.

    metric: key of ``metric_funcs`` selecting the measure
    k:      number of neighbours that contribute
    n:      number of items to return
    r:      order of the Minkowski distance (used by 'minkowski' only)

    Returns a pd.Series (index: item name, values: weighted score), largest
    first.
    """
    # Honour the caller's metric; it was previously hard-coded to 'pearson'.
    nearest_neighbors = computeNearestNeighbor(user_name, metric=metric, k=k, r=r)

    if metric in _DISTANCE_METRICS:
        # Smaller distance means more similar, so weight by the reciprocal.
        # NOTE: a distance of exactly 0 yields an infinite weight.
        nearest_neighbors = 1 / nearest_neighbors
    # Similarities (pearson, cosine) are used as weights directly.
    # NOTE: negative or all-zero similarities make this normalisation unreliable.
    weights = nearest_neighbors / nearest_neighbors.sum()

    user_unrated = df.loc[user_name].isnull()
    # One Series per neighbour: ratings of items the user has not rated,
    # already multiplied by that neighbour's weight.
    weighted_ratings = [
        df.loc[neighbor_name, user_unrated & df.loc[neighbor_name].notnull()] * weight
        for neighbor_name, weight in weights.items()
    ]
    # Align by item, add up the contributions, and keep the n best items.
    return pd.concat(weighted_ratings, axis=1).sum(axis=1, skipna=True).nlargest(n)


if __name__ == "__main__":
    print(manhattan("Angelica", "Bill"))
    print(computeNearestNeighbor('Hailey', metric='pearson'))
    # Recommendations for Hailey.
    print(recommend('Hailey'))
    print(recommend2('Hailey', metric='manhattan', k=3, n=5))
    print(recommend2('Hailey', metric='euclidean', k=3, n=5, r=2))
    print(recommend2('Hailey', metric='pearson', k=1, n=5))

转载地址:http://yaall.baihongyu.com/

你可能感兴趣的文章
Python——dummy_thread( _dummy_thread in Python 3.+)
查看>>
关于逻辑运算符书写效率问题 和数组 处理问题
查看>>
Performing a full database disaster recovery with RMAN
查看>>
Linux在本地使用yum安装软件(转)
查看>>
第5章 字符串----判断字符串是否相等
查看>>
javascript中遇到的字符串对象处理
查看>>
PHP GD 生成图片验证码+session获取储存验证码
查看>>
【web开发学习笔记】Structs2 Result学习笔记(一)简介
查看>>
android studio中取消关联git
查看>>
Mysql的共享锁和排他锁(转载)
查看>>
Effective C++--经验条款
查看>>
vue 项目中 自定义 webpack 的 配置文件(webpack.config.babel.js)
查看>>
考虑使用jruby
查看>>
深挖洞,广积粮,不称霸
查看>>
执行计划组件、组件、老化
查看>>
分享几个Tooltips插件
查看>>
初探Object Pascal的类(一)
查看>>
Android RenderScript 的使用基础篇
查看>>
MyEclipse6.0.1中SSH项目的配置 (转)
查看>>
HDOJ-1399 Starship Hakodate-maru
查看>>