例子

假定我们想从一些文本文件中构建一个图，将这个图限制为只包含重要的关系和用户，然后在得到的子图上运行 PageRank，最后返回排名靠前的用户的相关属性。使用 GraphX 可以通过如下方式实现：

  1. // Connect to the Spark cluster at the given master URL (second argument is presumably the application name)
  2. val sc = new SparkContext("spark://master.amplab.org", "research")
  3. // Load the user data: each comma-separated line becomes (first field as a Long user id, remaining fields as the attribute array)
  4. val users = (sc.textFile("graphx/data/users.txt")
  5. .map(line => line.split(",")).map( parts => (parts.head.toLong, parts.tail) ))
  6. // Build the follower graph from the edge-list file, which is already in userId -> userId format
  7. val followerGraph = GraphLoader.edgeListFile(sc, "graphx/data/followers.txt")
  8. // Attach the user attributes: each vertex's value becomes its attribute array from users
  9. val graph = followerGraph.outerJoinVertices(users) {
  10. case (uid, deg, Some(attrList)) => attrList
  11. // Vertices with no matching entry in users get an empty attribute array
  12. case (uid, deg, None) => Array.empty[String]
  13. }
  14. // Restrict the graph to users with exactly two attributes (presumably username and name)
  15. val subgraph = graph.subgraph(vpred = (vid, attr) => attr.size == 2)
  16. // Compute PageRank on the restricted subgraph (0.001 is presumably the convergence tolerance)
  17. val pagerankGraph = subgraph.pageRank(0.001)
  18. // Pair each user's attributes with their PageRank (0.0 when no rank is found), then print the 5 users with the highest rank
  19. val userInfoWithPageRank = subgraph.outerJoinVertices(pagerankGraph.vertices) {
  20. case (uid, attrList, Some(pr)) => (pr, attrList.toList)
  21. case (uid, attrList, None) => (0.0, attrList.toList)
  22. }
  23. println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))