启动训练前,先复用前面章节的数据处理和神经网络模型代码;如已阅读过这部分内容,可直接跳过。

    1. import random
    2. import numpy as np
    3. from PIL import Image
    4. import paddle.fluid as fluid
    5. import paddle.fluid.dygraph as dygraph
    6. from paddle.fluid.dygraph import Linear, Embedding, Conv2D, Pool2D
    7. class MovieLen(object):
    8. def __init__(self, use_poster):
    9. self.use_poster = use_poster
    10. # 声明每个数据文件的路径
    11. usr_info_path = "./work/ml-1m/users.dat"
    12. if use_poster:
    13. rating_path = "./work/ml-1m/new_rating.txt"
    14. else:
    15. rating_path = "./work/ml-1m/ratings.dat"
    16. movie_info_path = "./work/ml-1m/movies.dat"
    17. self.poster_path = "./work/ml-1m/posters/"
    18. # 得到电影数据
    19. self.movie_info, self.movie_cat, self.movie_title = self.get_movie_info(movie_info_path)
    20. # 记录电影的最大ID
    21. self.max_mov_cat = np.max([self.movie_cat[k] for k in self.movie_cat])
    22. self.max_mov_tit = np.max([self.movie_title[k] for k in self.movie_title])
    23. self.max_mov_id = np.max(list(map(int, self.movie_info.keys())))
    24. # 记录用户数据的最大ID
    25. self.max_usr_id = 0
    26. self.max_usr_age = 0
    27. self.max_usr_job = 0
    28. # 得到用户数据
    29. self.usr_info = self.get_usr_info(usr_info_path)
    30. # 得到评分数据
    31. self.rating_info = self.get_rating_info(rating_path)
    32. # 构建数据集
    33. self.dataset = self.get_dataset(usr_info=self.usr_info,
    34. rating_info=self.rating_info,
    35. movie_info=self.movie_info)
    36. # 划分数据集,获得数据加载器
    37. self.train_dataset = self.dataset[:int(len(self.dataset)*0.9)]
    38. self.valid_dataset = self.dataset[int(len(self.dataset)*0.9):]
    39. print("##Total dataset instances: ", len(self.dataset))
    40. print("##MovieLens dataset information: \nusr num: {}\n"
    41. "movies num: {}".format(len(self.usr_info),len(self.movie_info)))
    42. # 得到电影数据
    43. def get_movie_info(self, path):
    44. # 打开文件,编码方式选择ISO-8859-1,读取所有数据到data中
    45. with open(path, 'r', encoding="ISO-8859-1") as f:
    46. data = f.readlines()
    47. # 建立三个字典,分别用户存放电影所有信息,电影的名字信息、类别信息
    48. movie_info, movie_titles, movie_cat = {}, {}, {}
    49. # 对电影名字、类别中不同的单词计数
    50. t_count, c_count = 1, 1
    51. count_tit = {}
    52. # 按行读取数据并处理
    53. for item in data:
    54. item = item.strip().split("::")
    55. v_id = item[0]
    56. v_title = item[1][:-7]
    57. cats = item[2].split('|')
    58. v_year = item[1][-5:-1]
    59. titles = v_title.split()
    60. # 统计电影名字的单词,并给每个单词一个序号,放在movie_titles中
    61. for t in titles:
    62. if t not in movie_titles:
    63. movie_titles[t] = t_count
    64. t_count += 1
    65. # 统计电影类别单词,并给每个单词一个序号,放在movie_cat中
    66. for cat in cats:
    67. if cat not in movie_cat:
    68. movie_cat[cat] = c_count
    69. c_count += 1
    70. # 补0使电影名称对应的列表长度为15
    71. v_tit = [movie_titles[k] for k in titles]
    72. while len(v_tit)<15:
    73. v_tit.append(0)
    74. # 补0使电影种类对应的列表长度为6
    75. v_cat = [movie_cat[k] for k in cats]
    76. while len(v_cat)<6:
    77. v_cat.append(0)
    78. # 保存电影数据到movie_info中
    79. movie_info[v_id] = {'mov_id': int(v_id),
    80. 'title': v_tit,
    81. 'category': v_cat,
    82. 'years': int(v_year)}
    83. return movie_info, movie_cat, movie_titles
    84. def get_usr_info(self, path):
    85. # 性别转换函数,M-0, F-1
    86. def gender2num(gender):
    87. return 1 if gender == 'F' else 0
    88. # 打开文件,读取所有行到data中
    89. with open(path, 'r') as f:
    90. data = f.readlines()
    91. # 建立用户信息的字典
    92. use_info = {}
    93. max_usr_id = 0
    94. #按行索引数据
    95. for item in data:
    96. # 去除每一行中和数据无关的部分
    97. item = item.strip().split("::")
    98. usr_id = item[0]
    99. # 将字符数据转成数字并保存在字典中
    100. use_info[usr_id] = {'usr_id': int(usr_id),
    101. 'gender': gender2num(item[1]),
    102. 'age': int(item[2]),
    103. 'job': int(item[3])}
    104. self.max_usr_id = max(self.max_usr_id, int(usr_id))
    105. self.max_usr_age = max(self.max_usr_age, int(item[2]))
    106. self.max_usr_job = max(self.max_usr_job, int(item[3]))
    107. return use_info
    108. # 得到评分数据
    109. def get_rating_info(self, path):
    110. # 读取文件里的数据
    111. with open(path, 'r') as f:
    112. data = f.readlines()
    113. # 将数据保存在字典中并返回
    114. rating_info = {}
    115. for item in data:
    116. item = item.strip().split("::")
    117. usr_id,movie_id,score = item[0],item[1],item[2]
    118. if usr_id not in rating_info.keys():
    119. rating_info[usr_id] = {movie_id:float(score)}
    120. else:
    121. rating_info[usr_id][movie_id] = float(score)
    122. return rating_info
    123. # 构建数据集
    124. def get_dataset(self, usr_info, rating_info, movie_info):
    125. trainset = []
    126. for usr_id in rating_info.keys():
    127. usr_ratings = rating_info[usr_id]
    128. for movie_id in usr_ratings:
    129. trainset.append({'usr_info': usr_info[usr_id],
    130. 'mov_info': movie_info[movie_id],
    131. 'scores': usr_ratings[movie_id]})
    132. return trainset
    133. def load_data(self, dataset=None, mode='train'):
    134. use_poster = False
    135. # 定义数据迭代Batch大小
    136. BATCHSIZE = 256
    137. data_length = len(dataset)
    138. index_list = list(range(data_length))
    139. # 定义数据迭代加载器
    140. def data_generator():
    141. # 训练模式下,打乱训练数据
    142. if mode == 'train':
    143. random.shuffle(index_list)
    144. # 声明每个特征的列表
    145. usr_id_list,usr_gender_list,usr_age_list,usr_job_list = [], [], [], []
    146. mov_id_list,mov_tit_list,mov_cat_list,mov_poster_list = [], [], [], []
    147. score_list = []
    148. # 索引遍历输入数据集
    149. for idx, i in enumerate(index_list):
    150. # 获得特征数据保存到对应特征列表中
    151. usr_id_list.append(dataset[i]['usr_info']['usr_id'])
    152. usr_gender_list.append(dataset[i]['usr_info']['gender'])
    153. usr_age_list.append(dataset[i]['usr_info']['age'])
    154. usr_job_list.append(dataset[i]['usr_info']['job'])
    155. mov_id_list.append(dataset[i]['mov_info']['mov_id'])
    156. mov_tit_list.append(dataset[i]['mov_info']['title'])
    157. mov_cat_list.append(dataset[i]['mov_info']['category'])
    158. mov_id = dataset[i]['mov_info']['mov_id']
    159. if use_poster:
    160. # 不使用图像特征时,不读取图像数据,加快数据读取速度
    161. poster = Image.open(self.poster_path+'mov_id{}.jpg'.format(str(mov_id[0])))
    162. poster = poster.resize([64, 64])
    163. if len(poster.size) <= 2:
    164. poster = poster.convert("RGB")
    165. mov_poster_list.append(np.array(poster))
    166. score_list.append(int(dataset[i]['scores']))
    167. # 如果读取的数据量达到当前的batch大小,就返回当前批次
    168. if len(usr_id_list)==BATCHSIZE:
    169. # 转换列表数据为数组形式,reshape到固定形状
    170. usr_id_arr = np.array(usr_id_list)
    171. usr_gender_arr = np.array(usr_gender_list)
    172. usr_age_arr = np.array(usr_age_list)
    173. usr_job_arr = np.array(usr_job_list)
    174. mov_id_arr = np.array(mov_id_list)
    175. mov_cat_arr = np.reshape(np.array(mov_cat_list), [BATCHSIZE, 6]).astype(np.int64)
    176. mov_tit_arr = np.reshape(np.array(mov_tit_list), [BATCHSIZE, 1, 15]).astype(np.int64)
    177. if use_poster:
    178. mov_poster_arr = np.reshape(np.array(mov_poster_list)/127.5 - 1, [BATCHSIZE, 3, 64, 64]).astype(np.float32)
    179. else:
    180. mov_poster_arr = np.array([0.])
    181. scores_arr = np.reshape(np.array(score_list), [-1, 1]).astype(np.float32)
    182. # 放回当前批次数据
    183. yield [usr_id_arr, usr_gender_arr, usr_age_arr, usr_job_arr], \
    184. [mov_id_arr, mov_cat_arr, mov_tit_arr, mov_poster_arr], scores_arr
    185. # 清空数据
    186. usr_id_list, usr_gender_list, usr_age_list, usr_job_list = [], [], [], []
    187. mov_id_list, mov_tit_list, mov_cat_list, score_list = [], [], [], []
    188. mov_poster_list = []
    189. return data_generator
    190. class Model(dygraph.layers.Layer):
    191. def __init__(self, use_poster, use_mov_title, use_mov_cat, use_age_job):
    192. super(Model, self).__init__()
    193. # 将传入的name信息和bool型参数添加到模型类中
    194. self.use_mov_poster = use_poster
    195. self.use_mov_title = use_mov_title
    196. self.use_usr_age_job = use_age_job
    197. self.use_mov_cat = use_mov_cat
    198. # 获取数据集的信息,并构建训练和验证集的数据迭代器
    199. Dataset = MovieLen(self.use_mov_poster)
    200. self.Dataset = Dataset
    201. self.trainset = self.Dataset.train_dataset
    202. self.valset = self.Dataset.valid_dataset
    203. self.train_loader = self.Dataset.load_data(dataset=self.trainset, mode='train')
    204. self.valid_loader = self.Dataset.load_data(dataset=self.valset, mode='valid')
    205. """ define network layer for embedding usr info """
    206. USR_ID_NUM = Dataset.max_usr_id + 1
    207. # 对用户ID做映射,并紧接着一个Linear层
    208. self.usr_emb = Embedding([USR_ID_NUM, 32], is_sparse=False)
    209. self.usr_fc = Linear(32, 32)
    210. # 对用户性别信息做映射,并紧接着一个Linear层
    211. USR_GENDER_DICT_SIZE = 2
    212. self.usr_gender_emb = Embedding([USR_GENDER_DICT_SIZE, 16])
    213. self.usr_gender_fc = Linear(16, 16)
    214. # 对用户年龄信息做映射,并紧接着一个Linear层
    215. USR_AGE_DICT_SIZE = Dataset.max_usr_age + 1
    216. self.usr_age_emb = Embedding([USR_AGE_DICT_SIZE, 16])
    217. self.usr_age_fc = Linear(16, 16)
    218. # 对用户职业信息做映射,并紧接着一个Linear层
    219. USR_JOB_DICT_SIZE = Dataset.max_usr_job + 1
    220. self.usr_job_emb = Embedding([USR_JOB_DICT_SIZE, 16])
    221. self.usr_job_fc = Linear(16, 16)
    222. # 新建一个Linear层,用于整合用户数据信息
    223. self.usr_combined = Linear(80, 200, act='tanh')
    224. """ define network layer for embedding usr info """
    225. # 对电影ID信息做映射,并紧接着一个Linear层
    226. MOV_DICT_SIZE = Dataset.max_mov_id + 1
    227. self.mov_emb = Embedding([MOV_DICT_SIZE, 32])
    228. self.mov_fc = Linear(32, 32)
    229. # 对电影类别做映射
    230. CATEGORY_DICT_SIZE = len(Dataset.movie_cat) + 1
    231. self.mov_cat_emb = Embedding([CATEGORY_DICT_SIZE, 32], is_sparse=False)
    232. self.mov_cat_fc = Linear(32, 32)
    233. # 对电影名称做映射
    234. MOV_TITLE_DICT_SIZE = len(Dataset.movie_title) + 1
    235. self.mov_title_emb = Embedding([MOV_TITLE_DICT_SIZE, 32], is_sparse=False)
    236. self.mov_title_conv = Conv2D(1, 1, filter_size=(3, 1), stride=(2,1), padding=0, act='relu')
    237. self.mov_title_conv2 = Conv2D(1, 1, filter_size=(3, 1), stride=1, padding=0, act='relu')
    238. # 新建一个Linear层,用于整合电影特征
    239. self.mov_concat_embed = Linear(96, 200, act='tanh')
    240. # 定义计算用户特征的前向运算过程
    241. def get_usr_feat(self, usr_var):
    242. """ get usr features"""
    243. # 获取到用户数据
    244. usr_id, usr_gender, usr_age, usr_job = usr_var
    245. # 将用户的ID数据经过embedding和Linear计算,得到的特征保存在feats_collect中
    246. feats_collect = []
    247. usr_id = self.usr_emb(usr_id)
    248. usr_id = self.usr_fc(usr_id)
    249. usr_id = fluid.layers.relu(usr_id)
    250. feats_collect.append(usr_id)
    251. # 计算用户的性别特征,并保存在feats_collect中
    252. usr_gender = self.usr_gender_emb(usr_gender)
    253. usr_gender = self.usr_gender_fc(usr_gender)
    254. usr_gender = fluid.layers.relu(usr_gender)
    255. feats_collect.append(usr_gender)
    256. # 选择是否使用用户的年龄-职业特征
    257. if self.use_usr_age_job:
    258. # 计算用户的年龄特征,并保存在feats_collect中
    259. usr_age = self.usr_age_emb(usr_age)
    260. usr_age = self.usr_age_fc(usr_age)
    261. usr_age = fluid.layers.relu(usr_age)
    262. feats_collect.append(usr_age)
    263. # 计算用户的职业特征,并保存在feats_collect中
    264. usr_job = self.usr_job_emb(usr_job)
    265. usr_job = self.usr_job_fc(usr_job)
    266. usr_job = fluid.layers.relu(usr_job)
    267. feats_collect.append(usr_job)
    268. # 将用户的特征级联,并通过Linear层得到最终的用户特征
    269. usr_feat = fluid.layers.concat(feats_collect, axis=1)
    270. usr_feat = self.usr_combined(usr_feat)
    271. return usr_feat
    272. # 定义电影特征的前向计算过程
    273. def get_mov_feat(self, mov_var):
    274. """ get movie features"""
    275. # 获得电影数据
    276. mov_id, mov_cat, mov_title, mov_poster = mov_var
    277. feats_collect = []
    278. # 获得batchsize的大小
    279. batch_size = mov_id.shape[0]
    280. # 计算电影ID的特征,并存在feats_collect中
    281. mov_id = self.mov_emb(mov_id)
    282. mov_id = self.mov_fc(mov_id)
    283. mov_id = fluid.layers.relu(mov_id)
    284. feats_collect.append(mov_id)
    285. # 如果使用电影的种类数据,计算电影种类特征的映射
    286. if self.use_mov_cat:
    287. # 计算电影种类的特征映射,对多个种类的特征求和得到最终特征
    288. mov_cat = self.mov_cat_emb(mov_cat)
    289. mov_cat = fluid.layers.reduce_sum(mov_cat, dim=1, keep_dim=False)
    290. mov_cat = self.mov_cat_fc(mov_cat)
    291. feats_collect.append(mov_cat)
    292. if self.use_mov_title:
    293. # 计算电影名字的特征映射,对特征映射使用卷积计算最终的特征
    294. mov_title = self.mov_title_emb(mov_title)
    295. mov_title = self.mov_title_conv2(self.mov_title_conv(mov_title))
    296. mov_title = fluid.layers.reduce_sum(mov_title, dim=2, keep_dim=False)
    297. mov_title = fluid.layers.relu(mov_title)
    298. mov_title = fluid.layers.reshape(mov_title, [batch_size, -1])
    299. feats_collect.append(mov_title)
    300. # 使用一个全连接层,整合所有电影特征,映射为一个200维的特征向量
    301. mov_feat = fluid.layers.concat(feats_collect, axis=1)
    302. mov_feat = self.mov_concat_embed(mov_feat)
    303. return mov_feat
    304. # 定义个性化推荐算法的前向计算
    305. def forward(self, usr_var, mov_var):
    306. # 计算用户特征和电影特征
    307. usr_feat = self.get_usr_feat(usr_var)
    308. mov_feat = self.get_mov_feat(mov_var)
    309. # 根据计算的特征计算相似度
    310. res = fluid.layers.cos_sim(usr_feat, mov_feat)
    311. # 将相似度扩大范围到和电影评分相同数据范围
    312. res = fluid.layers.scale(res, scale=5)
    313. return usr_feat, mov_feat, res
    1. # 解压数据集
    2. !cd work && unzip -o -q ml-1m.zip