Sunshine Hotline Q&A platform (阳光热线问政平台)

http://wz.sun0769.com/index.php/question/questionType?type=4

items.py: add the following code

    from scrapy.item import Item, Field

    class SunItem(Item):
        # One complaint post from the Sunshine Hotline site
        number = Field()    # post number
        url = Field()       # page URL
        title = Field()     # post title
        content = Field()   # post body text
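A Scrapy Item behaves like a dict whose keys are limited to the declared fields; assigning an undeclared key raises a KeyError. A minimal usage sketch (the values here are placeholders):

    item = SunItem()
    item['title'] = 'example title'   # only declared fields may be assigned
    print(dict(item))                 # {'title': 'example title'}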

Create a custom SunSpider.py in the spiders directory

    import re

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    from tutorial.items import SunItem

    class SunSpider(CrawlSpider):
        name = 'sun0769'
        allowed_domains = ['wz.sun0769.com']
        start_urls = [
            'http://wz.sun0769.com/index.php/question/questionType?type=4'
        ]

        rules = (
            # Follow the paginated list pages; rewrite their links first
            Rule(LinkExtractor(allow='page'), process_links='process_request', follow=True),
            # Hand individual post pages to parse_content
            Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml$'), callback='parse_content'),
        )

        def process_request(self, links):
            # process_links hook: rebuild each list-page link from its page and type parameters
            ret = []
            for link in links:
                try:
                    page = re.search(r'page=\d*', link.url).group()
                    type_ = re.search(r'type=\d+', link.url).group()
                    link.url = ('http://wz.sun0769.com/index.php/question/questionType?'
                                + page + '&' + type_)
                except Exception:
                    # Leave the link unchanged if either parameter is missing
                    pass
                ret.append(link)
            return ret

        def parse_content(self, response):
            # Extract one complaint post from a detail page
            item = SunItem()
            title = response.xpath('//*[@class="greyframe"]/div/div/strong/text()')[0].extract().strip()
            item['url'] = response.url
            item['title'] = title
            item['number'] = title.split(':')[-1]
            item['content'] = response.xpath('//div[@class="c1 text14_2"]/text()').extract()[0].strip()
            yield item
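The process_links hook above normalizes every pagination link to a uniform questionType?page=...&type=... form before it is scheduled. A standalone sketch of the same regex logic, run on a hypothetical link URL (the example URL is illustrative, not taken from the live site):

    import re

    def normalize(url):
        # Pull out the page and type query parameters and rebuild the URL
        page = re.search(r'page=\d*', url).group()
        type_ = re.search(r'type=\d+', url).group()
        return 'http://wz.sun0769.com/index.php/question/questionType?' + page + '&' + type_

    # Hypothetical pagination link; real links on the site may differ
    print(normalize('http://wz.sun0769.com/index.php/question/questionType?type=4&page=30'))
    # -> http://wz.sun0769.com/index.php/question/questionType?page=30&type=4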

In pipelines.py, add the following code

    import json
    import codecs

    class JsonWriterPipeline(object):
        def __init__(self):
            self.file = codecs.open('items.json', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # Write each item as one JSON line, keeping Chinese text readable
            line = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.file.write(line)
            return item

        def close_spider(self, spider):
            # Scrapy calls close_spider automatically when the spider finishes
            self.file.close()
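Each processed item becomes one line of JSON in items.json (the JSON Lines format). A small sketch of reading the file back, assuming a crawl has already produced it:

    import json

    with open('items.json', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)      # one dict per crawled post
            print(item.get('title'))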

settings.py: add the following code (to enable the pipeline component)

    ITEM_PIPELINES = {
        'tutorial.pipelines.JsonWriterPipeline': 300,
    }
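The value 300 is the pipeline's order: Scrapy runs enabled pipelines from the lowest number to the highest, and the values are conventionally kept in the 0-1000 range. A sketch with a second, hypothetical pipeline to show how the ordering works:

    ITEM_PIPELINES = {
        'tutorial.pipelines.DuplicatesPipeline': 200,   # hypothetical pipeline, runs first
        'tutorial.pipelines.JsonWriterPipeline': 300,   # runs second
    }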

Debugging on Windows

Create a main.py file in the project root for debugging

    from scrapy import cmdline

    cmdline.execute('scrapy crawl sun0769'.split())
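cmdline.execute runs the crawl in the current process, so breakpoints set in the spider are hit by the IDE's debugger. An alternative sketch using Scrapy's CrawlerProcess API, which avoids going through the command-line entry point (assuming main.py sits next to scrapy.cfg so the project settings can be located):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('sun0769')   # look up the spider by its name
    process.start()            # blocks until the crawl finishes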