Scrapy: logging in automatically to scrape data

A Scrapy spider that logs in automatically and scrapes document data from WKINFO (hr.wkinfo.com.cn).
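
The post does not include utils.MyBase.MyConfig, the small config wrapper that
supplies the start URL and the account credentials. A minimal sketch of what it
might look like, assuming an INI-style file with [settings] and [HR] sections
(the file name and layout here are guesses based on how cf is used below):

# conf.ini (assumed layout):
#   [settings]
#   url = http://hr.wkinfo.com.cn/...
#
#   [HR]
#   username = ...
#   password = ...

try:
    import configparser                      # Python 3
except ImportError:
    import ConfigParser as configparser      # Python 2

class MyConfig(object):
    ''' Hypothetical stand-in: exposes an INI file as nested dicts, so that
        cf["HR"]["username"] returns the username option from [HR]. '''
    def __init__(self, path='conf.ini'):
        parser = configparser.ConfigParser()
        parser.read(path)
        self._data = dict((s, dict(parser.items(s))) for s in parser.sections())

    def __getitem__(self, section):
        return self._data[section]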

#!/usr/bin/python
# -*- coding: utf-8 -*-
import scrapy
import json
import urllib
from utils.MyBase import MyConfig

# Load the configuration file
cf = MyConfig()

class WkinfoItem(scrapy.Item):
    url = scrapy.Field()        # decoded URL of the document page
    url_id = scrapy.Field()     # URL of the listing page it came from
    name = scrapy.Field()       # document title
    fwjg = scrapy.Field()       # issuing agency (发文机关)
    effect = scrapy.Field()     # validity status
    starttime = scrapy.Field()  # promulgation date
    endtime = scrapy.Field()    # effective date
    wenhao = scrapy.Field()     # document number (文号)
    bianji = scrapy.Field()     # editor's note
    content = scrapy.Field()    # full text (HTML)
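
# Hypothetical addition, not part of the original post: extract() returns
# lists, so a small item pipeline could flatten each field into one cleaned
# string before export (it would need to be enabled via ITEM_PIPELINES in
# the project's settings.py).
class CleanTextPipeline(object):
    def process_item(self, item, spider):
        for key, value in item.items():
            # leave the raw HTML in 'content' alone; flatten the short fields
            if key != 'content' and isinstance(value, list):
                item[key] = "".join("".join(v.split()) for v in value)
        return item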


class WkinfoSpider(scrapy.Spider):
    name = "wk"

    allowed_domains = ['hr.wkinfo.com.cn']
    start_urls = [
        cf["settings"]["url"]
    ]


    formdata = {
        'username': cf["HR"]["username"],
        'password_text': '密码',   # '密码' means "password"; the form expects this literal value
        'password': cf["HR"]["password"],
        '_autologin': ''
    }


    def set_space(self, content, st='list'):
        ''' Strip redundant newlines and whitespace from a string (st='str')
            or from each string in a list, dropping empty rows. '''
        data = []
        if st == 'str':
            return "".join(content.split())

        for row in content:
            row = row.strip()
            if row == '':
                continue
            data.append("".join(row.split()))
        return data
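
    # Illustration of set_space with made-up values:
    #   set_space('  中华人民共和国  劳动法  ', st='str')  ->  '中华人民共和国劳动法'
    #   set_space([' 2008 年 1 月 ', '', ' 现行有效 '])    ->  ['2008年1月', '现行有效']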
        

    def up_cookie(self, response):
        ''' With the session cookie in place, request the configured start URL;
            its response goes to the default callback, parse(). '''
        yield scrapy.Request(cf["settings"]["url"])

    def aft_login(self, response):
        ''' After logging in, this redirect must be requested, or no data comes back. '''
        yield scrapy.Request('http://hr.wkinfo.com.cn/boldUsers', callback=self.up_cookie)

    def parse_login(self, response):
        ''' If "meg" equals "1", the POST login can proceed. '''
        _limits = json.loads(response.body)
        if _limits["meg"] == "1":
            yield scrapy.FormRequest('http://hr.wkinfo.com.cn/boldUsers/checkValidate', formdata=self.formdata, callback=self.aft_login)
        else:
            print _limits["meg"], '-----!'

    def start_requests(self):
        ''' First check whether the site is currently accepting logins. '''
        return [scrapy.FormRequest(url='http://hr.wkinfo.com.cn/boldUsers/ajaxLoginValidate',
            formdata=self.formdata,
            callback=self.parse_login)]
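
    # The login handshake implemented above, in request order:
    #   start_requests -> POST ajaxLoginValidate  (is a login slot free?)
    #   parse_login    -> POST checkValidate      (the actual login)
    #   aft_login      -> GET  /boldUsers         (required redirect; sets up the session)
    #   up_cookie      -> GET  the configured URL, whose response goes to parse()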

    def parse(self, response):
        wkinfo = WkinfoItem()
        url_id = response.xpath("//a[@class='resultTitle']/@href").extract()
        wkinfo['url_id'] = response.url

        for url in url_id:
            print '{}----'.format(url)
            yield scrapy.Request('http://hr.wkinfo.com.cn' + url, meta={'item': wkinfo}, callback=self.parse_content)

        # NOTE: the logout request is scheduled here alongside the detail
        # requests, so it may be fetched before all detail pages are done.
        print 'exit------'
        yield scrapy.Request("http://hr.wkinfo.com.cn/boldUsers/exitbold", callback=self.logout)

    def logout(self, response):
        ''' The site allows only one login at a time, so log out when the crawl
            is done; otherwise you have to wait several hours to log in again. '''
        print 'out-------'

    def parse_content(self, response):
        # Copy the item so each detail page fills its own instance instead of
        # all callbacks mutating the single object passed through meta.
        wkinfo = response.meta['item'].copy()
        wkinfo['url'] = urllib.unquote(response.url)
        wkinfo['name'] = response.xpath("//h1[@class='biao']/text()").extract()
        wkinfo['fwjg'] = response.xpath("//span[@id='cchcnpromulgatingagencyid']/a/text()").extract()
        wkinfo['starttime'] = response.xpath("//span[@id='cchcnpromulgatingdate']/text()").extract()
        wkinfo['endtime'] = response.xpath("//span[@id='cchcneffectivedate']/text()").extract()
        wkinfo['wenhao'] = response.xpath("//span[@id='cchcndocumentnumber']/text()").extract()
        wkinfo['bianji'] = response.xpath("//span[@id='cchcnmetanote']/text()").extract()
        wkinfo['effect'] = response.xpath("//span[@id='cchcnvaliditystatusid']/text()").extract()
        wkinfo['content'] = response.xpath("//div[@class='faguicon']").extract()
        yield wkinfo
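
With the file saved as a spider in a Scrapy project (and utils.MyBase plus the
config file in place), the crawl runs through the stock Scrapy CLI; -o exports
the yielded items, for example:

scrapy crawl wk -o wkinfo.json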