拉勾招聘网

以拉勾网的职位详情页为例，进行抓取

http://www.lagou.com/jobs/2101463.html

  # Scrape one Lagou job-detail page and print its main fields.
  # Every statement below is valid on both Python 2.7 and Python 3.
  import re  # no longer used below; kept in case later sections rely on it

  import requests
  from lxml import etree

  # A timeout keeps the script from hanging forever on a stalled connection,
  # and raise_for_status() reports an HTTP error response directly instead of
  # letting the XPath lookups below fail with an unrelated IndexError.
  response = requests.get('http://www.lagou.com/jobs/2101463.html', timeout=10)
  response.raise_for_status()
  resHtml = response.text
  html = etree.HTML(resHtml)

  title = html.xpath('//h1[@title]')[0].attrib['title']

  # Evaluate the shared XPath once; the first five <span> elements are read
  # in document order.  (An alternative selector for the salary alone is
  # '//span[@class="red"]'.)
  request_spans = html.xpath('//dd[@class="job_request"]/p/span')
  salary, worklocation, experience, education, worktype = [
      span.text for span in request_spans[:5]
  ]
  Temptation = html.xpath('//dd[@class="job_request"]/p[2]')[0].text
  # A single pre-formatted argument makes print() behave the same whether it
  # is parsed as the Python 2 statement or the Python 3 function.
  print('%s %s %s %s %s %s' % (
      salary, worklocation, experience, education, worktype, Temptation,
  ))

  # itertext() walks the text nodes under the element, so no markup has to be
  # stripped afterwards.  The previous approach ran a str regex over the
  # UTF-8 *bytes* from etree.tostring(): that raises TypeError on Python 3,
  # misses tags that contain a newline ('.' does not match '\n'), and leaves
  # character entities escaped.
  description_tag = html.xpath('//dd[@class="job_bt"]')[0]
  deal_descp = ''.join(description_tag.itertext())
  print(deal_descp.strip())

  publisher_name = html.xpath('//*[@class="publisher_name"]//@title')[0]
  pos = html.xpath('//*[@class="pos"]')[0].text
  # NOTE(review): the pinyin names suggest "handling rate" and "handling
  # time" of the publisher -- confirm against the page.
  data_nodes = html.xpath('//*[@class="data"]')
  chuli_lv = data_nodes[0].text
  chuli_yongshi = data_nodes[1].text
  print('%s %s %s %s' % (chuli_lv, chuli_yongshi, pos, publisher_name))