Exercise: Web Crawler

In this exercise you'll use Go's concurrency features to parallelize a web crawler.

Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.

Hint: you can keep a cache of the URLs that have already been fetched in a map, but a map alone is not safe for concurrent use!

exercise-web-crawler.go

  1. package main
import (
	"fmt"
	"sync"
)
  5. type Fetcher interface {
  6. // Fetch returns the body of URL and
  7. // a slice of URLs found on that page.
  8. Fetch(url string) (body string, urls []string, err error)
  9. }
  10. // Crawl uses fetcher to recursively crawl
  11. // pages starting with url, to a maximum of depth.
  12. func Crawl(url string, depth int, fetcher Fetcher) {
  13. // TODO: Fetch URLs in parallel.
  14. // TODO: Don't fetch the same URL twice.
  15. // This implementation doesn't do either:
  16. if depth <= 0 {
  17. return
  18. }
  19. body, urls, err := fetcher.Fetch(url)
  20. if err != nil {
  21. fmt.Println(err)
  22. return
  23. }
  24. fmt.Printf("found: %s %q\n", url, body)
  25. for _, u := range urls {
  26. Crawl(u, depth-1, fetcher)
  27. }
  28. return
  29. }
  30. func main() {
  31. Crawl("https://golang.org/", 4, fetcher)
  32. }
  33. // fakeFetcher is Fetcher that returns canned results.
  34. type fakeFetcher map[string]*fakeResult
  35. type fakeResult struct {
  36. body string
  37. urls []string
  38. }
  39. func (f fakeFetcher) Fetch(url string) (string, []string, error) {
  40. if res, ok := f[url]; ok {
  41. return res.body, res.urls, nil
  42. }
  43. return "", nil, fmt.Errorf("not found: %s", url)
  44. }
  45. // fetcher is a populated fakeFetcher.
  46. var fetcher = fakeFetcher{
  47. "https://golang.org/": &fakeResult{
  48. "The Go Programming Language",
  49. []string{
  50. "https://golang.org/pkg/",
  51. "https://golang.org/cmd/",
  52. },
  53. },
  54. "https://golang.org/pkg/": &fakeResult{
  55. "Packages",
  56. []string{
  57. "https://golang.org/",
  58. "https://golang.org/cmd/",
  59. "https://golang.org/pkg/fmt/",
  60. "https://golang.org/pkg/os/",
  61. },
  62. },
  63. "https://golang.org/pkg/fmt/": &fakeResult{
  64. "Package fmt",
  65. []string{
  66. "https://golang.org/",
  67. "https://golang.org/pkg/",
  68. },
  69. },
  70. "https://golang.org/pkg/os/": &fakeResult{
  71. "Package os",
  72. []string{
  73. "https://golang.org/",
  74. "https://golang.org/pkg/",
  75. },
  76. },
  77. }