# OpenAPI 3.0 description of the LOCKSS Crawler Service REST API.
openapi: 3.0.3
info:
  title: LOCKSS Crawler Service REST API
  description: REST API of the LOCKSS Crawler Service
  contact:
    name: LOCKSS Support
    # Plain project URL; a proxy host prefix had been spliced into the link.
    url: https://www.lockss.org/
    email: lockss-support@lockss.org
  license:
    name: BSD-3-Clause
    url: https://opensource.org/licenses/BSD-3-Clause
  version: 2.0.0
servers:
- url: https://laaws.lockss.org:443/
# Top-level security requirement: applies to every operation that does not
# declare its own `security`; the scheme is defined under
# components/securitySchemes.
security:
- basicAuth: []
# Tags used to group the operations declared under `paths`.
tags:
- name: crawls
  description: requests related to crawls
- name: crawlers
  description: requests related to crawlers
- name: jobs
  description: requests related to crawl jobs
# Declared because the /status operation is tagged `status`.
- name: status
  description: requests related to the status of the service
# The /ws/crawls operation is a JSON GET driven by a query string.
- name: ws
  description: requests that query crawls by a query string
paths:
  # Collection resource listing the crawlers supported by the service.
  /crawlers:
    get:
      tags:
      - crawlers
      summary: Get the list of supported crawlers.
      description: Return the list of supported crawlers.
      operationId: getCrawlers
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The Status of supported Crawlers.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlerStatuses'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Configuration of a single crawler, addressed by its identifier.
  /crawlers/{crawlerId}:
    get:
      tags:
      - crawlers
      summary: Return information about a crawler.
      description: Get information related to an installed crawler.
      operationId: getCrawlerConfig
      parameters:
      - name: crawlerId
        in: path
        description: Identifier for the crawler
        required: true
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: Crawler Configuration Found
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlerConfig'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged listing of crawls; paging is driven by `limit` and
  # `continuationToken`.
  /crawls:
    get:
      tags:
      - crawls
      summary: Get the list of crawls.
      description: Get a list of crawls a pageful at a time as defined by limit.
      operationId: getCrawls
      parameters:
      - name: limit
        in: query
        description: The number of crawls per page
        schema:
          type: integer
          default: 50
      - name: continuationToken
        in: query
        description: The continuation token of the next page of crawl status data
          to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested crawls
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Status of a single crawl, addressed by its job id.
  /crawls/{jobId}:
    get:
      tags:
      - crawls
      summary: Get the crawl status of this job
      description: Get the job represented by this crawl id
      operationId: getCrawlById
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The crawl status of the requested crawl
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlStatus'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged list of the urls fetched by one crawl.
  /crawls/{jobId}/fetched:
    get:
      tags:
      - crawls
      summary: A pageable list of fetched urls.
      description: Get a list of fetched urls.
      operationId: getCrawlFetched
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      # The page holds urls (see urlPager), so both paging parameters are
      # described in terms of urls.
      - name: limit
        in: query
        description: The number of urls per page.
        schema:
          type: integer
      - name: continuationToken
        in: query
        description: The continuation token of the next page of urls to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested fetched urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged list of the urls excluded by one crawl.
  /crawls/{jobId}/excluded:
    get:
      tags:
      - crawls
      summary: A pageable list of excluded urls.
      description: Get a list of excluded urls.
      operationId: getCrawlExcluded
      parameters:
      - name: jobId
        in: path
        description: identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      # The page holds urls (see urlPager), so the page size counts urls.
      - name: limit
        in: query
        description: The number of urls per page.
        schema:
          type: integer
      - name: continuationToken
        in: query
        description: The continuation token of the next page of urls to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested excluded urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged list of the urls one crawl found to be not modified.
  /crawls/{jobId}/notModified:
    get:
      tags:
      - crawls
      summary: A pageable list of not modified urls.
      description: Get a list of not modified urls.
      operationId: getCrawlNotModified
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      # The page holds urls (see urlPager), so the page size counts urls.
      - name: limit
        in: query
        description: The number of urls per page.
        schema:
          type: integer
      - name: continuationToken
        in: query
        description: The continuation token of the next page of urls to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested not modified urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged list of the urls still pending in one crawl.
  /crawls/{jobId}/pending:
    get:
      tags:
      - crawls
      summary: A pageable list of pending urls.
      description: Get a list of pending urls.
      operationId: getCrawlPending
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      # The page holds urls (see urlPager), so the page size counts urls.
      - name: limit
        in: query
        description: The number of urls per page.
        schema:
          type: integer
      - name: continuationToken
        in: query
        description: The continuation token of the next page of urls to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested pending urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged list of the urls parsed by one crawl.
  /crawls/{jobId}/parsed:
    get:
      tags:
      - crawls
      summary: A pageable list of parsed urls.
      description: Get a list of parsed urls.
      operationId: getCrawlParsed
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      # The page holds urls (see urlPager), so the page size counts urls.
      - name: limit
        in: query
        description: The number of urls per page.
        schema:
          type: integer
      - name: continuationToken
        in: query
        description: The continuation token of the next page of urls to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested parsed urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged list of the urls that produced errors in one crawl.
  /crawls/{jobId}/errors:
    get:
      tags:
      - crawls
      summary: A pageable list of urls with errors.
      description: Get a list of urls with errors.
      operationId: getCrawlErrors
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      # The page holds urls (see urlPager), so the page size counts urls.
      - name: limit
        in: query
        description: The number of urls per page.
        schema:
          type: integer
      - name: continuationToken
        in: query
        description: The continuation token of the next page of urls to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested urls with errors.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Paged list of the urls of one mime type seen by one crawl.
  /crawls/{jobId}/mimeType/{type}:
    get:
      tags:
      - crawls
      summary: A pageable list of urls of mimetype.
      description: Get a list of urls of mimetype.
      operationId: getCrawlByMimeType
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl.
        required: true
        schema:
          type: string
      - name: type
        in: path
        description: The mime type of the urls to be returned.
        required: true
        schema:
          type: string
      # The page holds urls (see urlPager), so the page size counts urls.
      - name: limit
        in: query
        description: The number of urls per page.
        schema:
          type: integer
      - name: continuationToken
        in: query
        description: The continuation token of the next page of urls to be returned.
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The requested urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Crawl job collection: list (GET), enqueue (POST) and delete all (DELETE).
  # Status codes below are quoted because OpenAPI requires response keys to
  # be strings so that the YAML and JSON forms agree.
  /jobs:
    get:
      tags:
      - jobs
      summary: Get the list of crawl jobs.
      description: Get a list of crawl jobs a pageful at a time as defined by the
        continuation token and limit.
      operationId: getJobs
      parameters:
      - name: limit
        in: query
        description: The number of jobs per page
        schema:
          type: integer
          default: 50
      - name: continuationToken
        in: query
        description: The continuation token of the next page of jobs to be returned.
        schema:
          type: string
      responses:
        '200':
          description: The requested crawl jobs
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/jobPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
    post:
      tags:
      - jobs
      summary: Request a crawl as defined by the descriptor
      description: Enqueue a new crawl job as defined by the crawl descriptor and
        return it.
      operationId: queueJob
      requestBody:
        description: crawl request
        content:
          # NOTE(review): any media type is accepted although the body is a
          # crawlDesc object - confirm whether application/json was intended.
          '*/*':
            schema:
              $ref: '#/components/schemas/crawlDesc'
        required: true
      responses:
        '202':
          description: The crawl request has been queued for operation.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlJob'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
      x-codegen-request-body-name: crawlDesc
    delete:
      tags:
      - jobs
      summary: Delete all of the currently queued and active jobs
      description: Halt and delete all of the currently queued and active crawl jobs
      operationId: deleteJobs
      responses:
        '200':
          description: All crawl jobs have been stopped and deleted.
          # No response body is declared for a successful delete.
          content: {}
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # A single crawl job: fetch (GET) or stop and remove (DELETE).
  # Status codes below are quoted because OpenAPI requires response keys to
  # be strings so that the YAML and JSON forms agree.
  /jobs/{jobId}:
    get:
      tags:
      - jobs
      summary: Get the crawl status of this job
      description: Get the crawl job with a given crawl id
      operationId: getCrawlJob
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl job.
        required: true
        schema:
          type: string
      responses:
        '200':
          description: The crawl Job of the requested crawl
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlJob'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
    delete:
      tags:
      - jobs
      summary: Remove or stop a crawl job.
      description: Delete a crawl job with the given job id, stopping any current
        processing, if necessary.
      operationId: deleteCrawlJob
      parameters:
      - name: jobId
        in: path
        description: The identifier used to identify a specific crawl job.
        required: true
        schema:
          type: string
      responses:
        '200':
          description: The crawlJob of the deleted crawl.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlJob'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Service status resource.
  /status:
    get:
      tags:
      - status
      summary: Get the status of the service
      description: Get the status of the service
      operationId: getStatus
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: The status of the service
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/apiStatus'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
  # Query for crawls selected by a required query string.
  /ws/crawls:
    get:
      tags:
      - ws
      summary: Query for list of crawls based on subset defined by query string
      description: Query for crawls that meet a set of specified conditions
      operationId: getWsCrawls
      parameters:
      - name: crawlQuery
        in: query
        description: The query that specifies the crawls to be returned
        required: true
        schema:
          type: string
      responses:
        # Status codes are quoted: OpenAPI requires response keys to be
        # strings so that the YAML and JSON forms agree.
        '200':
          description: Information about the requested crawls
          content:
            application/json:
              # NOTE(review): the description is plural but the schema is a
              # single crawlWsResult - confirm whether an array was intended.
              schema:
                $ref: '#/components/schemas/crawlWsResult'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
components:
  schemas:
    # Error payload referenced by the `default` response of every operation.
    errorResult:
      type: object
      properties:
        message:
          type: string
        code:
          type: integer
        # Optional; only `message` and `code` are required.
        rootCause:
          type: string
      required:
      - message
      - code
    # Wrapper object returned by GET /crawlers.
    crawlerStatuses:
      type: object
      properties:
        # Free-form keys; every value is a crawlerStatus object.
        crawlerMap:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/crawlerStatus'
          description: A map of crawler status objects
      description: The statuses of the supported crawlers.
    # Per-mime-type counter; used as the item type of crawlStatus.mimeTypes.
    mimeCounter:
      type: object
      description: A counter for mimeTypes seen during a crawl.
      required:
      - mimeType
      properties:
        mimeType:
          description: The mime type to count.
          type: string
        count:
          description: The number of elements of mime type
          type: integer
          format: int32
        counterLink:
          description: >-
            A link to the list of count elements or to a pager with
            count elements.
          type: string
    # Pagination metadata shared by urlPager, jobPager and crawlPager.
    pageInfo:
      type: object
      description: The information related to pagination of content
      # Every property except nextLink is required.
      required:
      - continuationToken
      - curLink
      - resultsPerPage
      - totalCount
      properties:
        totalCount:
          description: The total number of elements to be paginated
          type: integer
          format: int32
        resultsPerPage:
          description: The number of results per page.
          type: integer
          format: int32
        continuationToken:
          description: The continuation token.
          type: string
        curLink:
          description: The link to the current page.
          type: string
        nextLink:
          description: The link to the next page.
          type: string
    # Status of a job; referenced by crawlJob and crawlStatus.
    jobStatus:
      required:
      - statusCode
      type: object
      properties:
        # A string enum, not a number, so the description says "code".
        statusCode:
          type: string
          description: The code for this status.
          enum:
          - STATUS_UNKNOWN
          - STATUS_QUEUED
          - STATUS_ACTIVE
          - STATUS_SUCCESSFUL
          - STATUS_ERROR
          - STATUS_ABORTED
          - STATUS_WINDOW_CLOSED
          - STATUS_FETCH_ERROR
          - STATUS_NO_PUB_PERMISSION
          - STATUS_PLUGIN_ERROR
          - STATUS_REPO_ERR
          - STATUS_RUNNING_AT_CRASH
          - STATUS_EXTRACTOR_ERROR
          - STATUS_CRAWL_TEST_SUCCESSFUL
          - STATUS_CRAWL_TEST_FAIL
          - STATUS_INELIGIBLE
          - STATUS_INACTIVE_REQUEST
          - STATUS_INTERRUPTED
        msg:
          type: string
          description: A text message explaining this status.
      description: A status which includes a code and a message.
    # Payload returned by GET /status.
    apiStatus:
      type: object
      description: The status information of the service
      # Only these two are mandatory; every other property is optional.
      required:
      - apiVersion
      - ready
      properties:
        apiVersion:
          description: The version of the API
          type: string
        componentName:
          description: The name of the component
          type: string
        componentVersion:
          description: The version of the component software
          type: string
        lockssVersion:
          description: The version of the LOCKSS system
          type: string
        ready:
          description: The indication of whether the service is available
          type: boolean
        serviceName:
          description: The name of the service
          type: string
        readyTime:
          description: The time the service last became ready.
          type: integer
          format: int64
        reason:
          description: The reason the service isn't ready.
          type: string
        startupStatus:
          description: Enum indicating progress of plugin/AU processing at startup.
          type: string
          enum:
          - NONE
          - PLUGINS_CRAWLING
          - PLUGINS_COLLECTED
          - PLUGINS_LOADING
          - PLUGINS_LOADED
          - AUS_STARTING
          - AUS_STARTED
    # Url counter; referenced by the *Items and errors properties of
    # crawlStatus.
    counter:
      required:
      - count
      - itemsLink
      type: object
      properties:
        count:
          type: integer
          description: The number of elements
          format: int32
        itemsLink:
          type: string
          # Plain multi-line scalar: the line break folds to a single space.
          # Backslashes are literal in plain scalars, so none are used.
          description: A link to the list of count items or to a pager with
            count items.
      description: A counter for urls.
    # Request body of POST /jobs; also embedded in crawlJob.
    crawlDesc:
      type: object
      description: A descriptor for a crawl.
      required:
      - auId
      - crawlKind
      properties:
        auId:
          description: The identifier of the archival unit to be crawled.
          type: string
        crawlKind:
          description: The kind of crawl being performed either 'newContent' or 'repair'.
          type: string
          enum:
          - newContent
          - repair
        crawlerId:
          description: The crawler to be used for this crawl.
          type: string
          default: classic
        forceCrawl:
          description: >-
            An indication of whether the crawl is to be forced, suppressing
            conditions that might otherwise prevent the crawl from happening.
          type: boolean
          default: false
        refetchDepth:
          description: The refetch depth to use for a deep crawl.
          type: integer
          format: int32
          default: -1
        priority:
          description: The priority for the crawl.
          type: integer
          format: int32
          default: 0
        crawlList:
          description: The list of URLs to crawl.
          type: array
          items:
            type: string
        crawlDepth:
          description: >-
            The depth to which the links should be followed. 0 means
            do not follow links.
          type: integer
          format: int32
        # Free-form keys; each value is an object with no declared properties.
        extraCrawlerData:
          description: A map of additional properties for a crawl on a given crawler.
          type: object
          additionalProperties:
            type: object
            properties: {}
    # Job record returned by the /jobs operations and listed in jobPager.
    crawlJob:
      type: object
      description: The job resulting from a request to perform a crawl.
      required:
      - crawlDesc
      - jobId
      - jobStatus
      - requestDate
      properties:
        crawlDesc:
          $ref: '#/components/schemas/crawlDesc'
        requestDate:
          description: The timestamp when the crawl was requested.
          type: integer
          format: int64
        jobId:
          description: Identifier of the crawl job.
          type: string
        jobStatus:
          $ref: '#/components/schemas/jobStatus'
        # startDate, endDate and result are optional.
        startDate:
          description: The timestamp when the crawl began.
          type: integer
          format: int64
        endDate:
          description: The timestamp when the crawl ended.
          type: integer
          format: int64
        result:
          description: A URI which can be used to retrieve the crawl data.
          type: string
    # Status of one crawler; the value type of crawlerStatuses.crawlerMap.
    crawlerStatus:
      required:
      - isEnabled
      type: object
      properties:
        isEnabled:
          type: boolean
          description: Is the crawler enabled
        isAutoCrawlEnabled:
          type: boolean
          description: Does crawler autocrawl AUs when needed.
        numJobsActive:
          type: integer
          description: The number of jobs running.
          format: int32
        numJobsFailed:
          type: integer
          description: The number of jobs failed.
          format: int32
        numJobsSuccessful:
          type: integer
          description: The number of jobs succeeded
          format: int32
        # Distinct from numJobsActive, which counts the jobs running.
        numJobsPending:
          type: integer
          description: The number of pending jobs
          format: int32
        errMessage:
          type: string
      description: Status about a specific crawler.
    # Payload returned by GET /crawlers/{crawlerId}.
    crawlerConfig:
      required:
      - attributes
      - crawlerId
      type: object
      properties:
        crawlerId:
          type: string
          description: The identifier for this crawler
          example: classic
        # Free-form string-to-string map.
        attributes:
          type: object
          additionalProperties:
            type: string
          description: Key-value pairs providing crawler-specific attributes and
            configuration information.
      description: Configuration information about a specific crawler.
    # Status of one crawl; returned by GET /crawls/{jobId} and listed in
    # crawlPager.
    crawlStatus:
      required:
      - auId
      - auName
      - crawlerId
      - endTime
      - jobId
      - jobStatus
      - priority
      - startTime
      - startUrls
      - type
      type: object
      properties:
        jobId:
          type: string
          description: The id for the crawl.
        auId:
          type: string
          description: The id for the au.
        auName:
          type: string
          description: The name for the au.
        type:
          type: string
          description: The type of crawl.
        startUrls:
          type: array
          description: The array of start urls.
          items:
            type: string
        priority:
          type: integer
          description: The priority for this crawl.
          format: int32
        crawlerId:
          type: string
          description: The id of the crawler used for this crawl.
          default: classic
        sources:
          type: array
          description: The sources to use for the crawl.
          items:
            type: string
        depth:
          type: integer
          description: The depth of the crawl.
          format: int32
        refetchDepth:
          type: integer
          description: The refetch depth of the crawl.
          format: int32
        proxy:
          type: string
          description: The proxy used for crawling.
        startTime:
          type: integer
          description: The timestamp for the start of crawl.
          format: int64
        endTime:
          type: integer
          description: The timestamp for the end of the crawl.
          format: int64
        jobStatus:
          $ref: '#/components/schemas/jobStatus'
        isWaiting:
          type: boolean
          description: True if the crawl is waiting to start.
        isActive:
          type: boolean
          description: True if the crawl is active.
        isError:
          type: boolean
          description: True if the crawl has errored.
        bytesFetched:
          type: integer
          description: The number of bytes fetched.
          format: int64
        # Url counters: each carries a count plus a link to the url list.
        fetchedItems:
          $ref: '#/components/schemas/counter'
        excludedItems:
          $ref: '#/components/schemas/counter'
        notModifiedItems:
          $ref: '#/components/schemas/counter'
        parsedItems:
          $ref: '#/components/schemas/counter'
        pendingItems:
          $ref: '#/components/schemas/counter'
        errors:
          $ref: '#/components/schemas/counter'
        mimeTypes:
          type: array
          description: The list of urls by mimeType.
          items:
            $ref: '#/components/schemas/mimeCounter'
      description: The status of a single crawl.
    # One page of urls; returned by the /crawls/{jobId}/... url list
    # operations.
    urlPager:
      required:
      - pageInfo
      - urls
      type: object
      properties:
        pageInfo:
          $ref: '#/components/schemas/pageInfo'
        urls:
          type: array
          description: A list of urls with related info.
          items:
            $ref: '#/components/schemas/urlInfo'
      description: A Pager for urls with maps.
    # One page of crawl jobs; returned by GET /jobs.
    jobPager:
      type: object
      description: A display page of jobs
      required:
      - jobs
      - pageInfo
      properties:
        jobs:
          description: The jobs displayed in the page
          type: array
          items:
            $ref: '#/components/schemas/crawlJob'
        pageInfo:
          $ref: '#/components/schemas/pageInfo'
    # One page of crawl statuses; returned by GET /crawls.
    crawlPager:
      type: object
      description: A display page of crawl status
      required:
      - crawls
      - pageInfo
      properties:
        crawls:
          description: The crawls displayed in the page
          type: array
          items:
            $ref: '#/components/schemas/crawlStatus'
        pageInfo:
          $ref: '#/components/schemas/pageInfo'
    # One url entry; the item type of urlPager.urls.
    urlInfo:
      type: object
      description: information related to an url.
      required:
      - url
      properties:
        url:
          description: The url string
          type: string
        # Optional error detail attached to the url.
        error:
          $ref: '#/components/schemas/urlError'
        referrers:
          description: An optional list of referrers.
          type: array
          items:
            type: string
    # Error detail for a url; referenced by urlInfo.error.
    urlError:
      type: object
      description: information related to an error for a url.
      required:
      - message
      - severity
      properties:
        message:
          description: The error message
          type: string
        severity:
          description: the severity of the error.
          type: string
          enum:
          - Warning
          - Error
          - Fatal
    # Payload returned by GET /ws/crawls. Only auId and auName are required.
    crawlWsResult:
      required:
      - auId
      - auName
      type: object
      properties:
        auId:
          type: string
        auName:
          type: string
        priority:
          type: integer
          format: int32
        crawlKey:
          type: string
        crawlType:
          type: string
        # 64-bit, matching the other timestamps in this file (for example
        # crawlStatus.startTime); int32 cannot hold millisecond timestamps.
        # NOTE(review): confirm the unit of startTime and duration.
        startTime:
          type: integer
          format: int64
        duration:
          type: integer
          format: int64
        crawlStatus:
          type: string
        # 64-bit, matching crawlStatus.bytesFetched.
        bytesFetchedCount:
          type: integer
          format: int64
        pagesFetchedCount:
          type: integer
          format: int32
        pagesFetched:
          type: array
          items:
            type: string
        pagesParsedCount:
          type: integer
          format: int32
        pagesParsed:
          type: array
          items:
            type: string
        pagesPendingCount:
          type: integer
          format: int32
        pagesPending:
          type: array
          items:
            type: string
        pagesExcludedCount:
          type: integer
          format: int32
        pagesExcluded:
          type: array
          items:
            type: string
        offSiteUrlsExcludedCount:
          type: integer
          format: int32
        pagesNotModifiedCount:
          type: integer
          format: int32
        pagesNotModified:
          type: array
          items:
            type: string
        pagesWithErrorsCount:
          type: integer
          format: int32
        # Inline object per failing page: url, severity and message strings.
        pagesWithErrors:
          type: array
          items:
            type: object
            properties:
              url:
                type: string
              severity:
                type: string
              message:
                type: string
        mimeTypeCount:
          type: integer
          format: int32
        mimeTypes:
          type: array
          items:
            type: string
        sources:
          type: array
          items:
            type: string
        startingUrls:
          type: array
          items:
            type: string
        refetchDepth:
          type: integer
          format: int32
        linkDepth:
          type: integer
          format: int32
  # Security schemes available to the top-level `security` requirement.
  securitySchemes:
    basicAuth:
      description: HTTP Basic Authentication. Works over `HTTP` and `HTTPS`
      type: http
      scheme: basic