Tencent Recruitment

http://hr.tencent.com/position.php

items.py: add the following code:

  from scrapy.item import Item, Field

  class TencentItem(Item):
      title = Field()
      catalog = Field()
      workLocation = Field()
      recruitNumber = Field()
      duty = Field()
      Job_requirement = Field()
      url = Field()
      publishTime = Field()
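
Once instantiated, a TencentItem behaves like a dict, which is how the spider below fills it in. A quick sketch (the values are made up for illustration):

  from tutorial.items import TencentItem

  item = TencentItem()
  item['title'] = 'Example position'  # hypothetical value
  item['recruitNumber'] = '2'         # hypothetical value
  print(dict(item))  # {'title': 'Example position', 'recruitNumber': '2'}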

Create a custom spider file, tencent_info.py, under the spiders directory:

  # -*- coding:utf-8 -*-
  import re

  from scrapy.linkextractors import LinkExtractor
  from scrapy.spiders import CrawlSpider, Rule

  from tutorial.items import TencentItem

  class TencentSpider(CrawlSpider):
      name = "tengxun_info"
      allowed_domains = ["tencent.com"]
      start_urls = [
          "http://hr.tencent.com/position.php"
      ]
      rules = [
          # Follow pagination links such as position.php?start=10.
          Rule(LinkExtractor(allow=(r"start=\d+",))),
          # Follow links to job detail pages and parse them with parse_item.
          Rule(LinkExtractor(allow=(r"position_detail\.php",)), follow=True, callback='parse_item')
      ]

      def parse_item(self, response):
          item = TencentItem()
          title = response.xpath('//*[@id="sharetitle"]/text()')[0].extract()
          workLocation = response.xpath('//*[@class="lightblue l2"]/../text()')[0].extract()
          catalog = response.xpath('//*[@class="lightblue"]/../text()')[0].extract()
          recruitNumber = response.xpath('//*[@class="lightblue"]/../text()').re(r'(\d+)')[0]
          # The first "squareli" block holds the duties; strip the HTML tags.
          duty_pre = response.xpath('//*[@class="squareli"]')[0].extract()
          duty = re.sub('<.*?>', '', duty_pre)
          # The second "squareli" block holds the job requirements.
          Job_requirement_pre = response.xpath('//*[@class="squareli"]')[1].extract()
          Job_requirement = re.sub('<.*?>', '', Job_requirement_pre)
          item['title'] = title
          item['url'] = response.url
          item['workLocation'] = workLocation
          item['catalog'] = catalog
          item['recruitNumber'] = recruitNumber
          item['duty'] = duty
          item['Job_requirement'] = Job_requirement
          yield item
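
Before trusting these XPath expressions, it helps to test them interactively with scrapy shell, which loads one page and drops you into a session with a ready-made response object. A sketch of such a session (the detail-page URL is a placeholder; substitute any real posting):

  # In a terminal (use a real position_detail.php URL from the site):
  #   scrapy shell "http://hr.tencent.com/position_detail.php?id=<some-id>"
  # Then, inside the shell:
  response.xpath('//*[@id="sharetitle"]/text()').extract_first()  # job title
  response.xpath('//*[@class="squareli"]').extract()  # duty and requirement blocks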

pipelines.py: add the following code:

  import json
  import codecs

  class JsonWriterPipeline(object):
      def __init__(self):
          self.file = codecs.open('items.json', 'w', encoding='utf-8')

      def process_item(self, item, spider):
          line = json.dumps(dict(item), ensure_ascii=False) + "\n"
          self.file.write(line)
          return item

      # Scrapy calls close_spider on each pipeline when the spider finishes.
      def close_spider(self, spider):
          self.file.close()
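
Because process_item writes one JSON object per line, items.json ends up in JSON Lines format and can be checked with a short script. A minimal sketch, assuming a crawl has already produced items.json:

  import codecs
  import json

  # Each line of items.json is a single serialized item.
  with codecs.open('items.json', encoding='utf-8') as f:
      for line in f:
          item = json.loads(line)
          print(item['title'])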

settings.py: add the following code (this enables the pipeline component):

  ITEM_PIPELINES = {
      'tutorial.pipelines.JsonWriterPipeline': 300,
  }
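
The value 300 is the pipeline's order: Scrapy runs all enabled pipelines on each item from the lowest number to the highest (conventionally in the 0-1000 range). A sketch of how a second pipeline would slot in (DuplicatesPipeline is a hypothetical name, not part of this tutorial):

  ITEM_PIPELINES = {
      'tutorial.pipelines.DuplicatesPipeline': 100,  # hypothetical: would run first
      'tutorial.pipelines.JsonWriterPipeline': 300,  # runs after it
  }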

Create a main.py file in the project root, for use when debugging:

  from scrapy import cmdline

  cmdline.execute('scrapy crawl tengxun_info'.split())
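
An alternative to cmdline.execute is Scrapy's CrawlerProcess API, which runs the spider in-process and is easier to step through in a debugger. A minimal sketch using the project's own settings:

  from scrapy.crawler import CrawlerProcess
  from scrapy.utils.project import get_project_settings

  process = CrawlerProcess(get_project_settings())
  process.crawl('tengxun_info')  # spider name, as declared in tencent_info.py
  process.start()  # blocks until the crawl finishes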