---
# OpenAPI 3.0 definition of the LOCKSS Crawler Service REST API.
#
# Layout:
#   info / servers / security / tags  - service-level metadata
#   paths                             - REST operations grouped by tag
#   components.schemas                - JSON payload models
#   components.securitySchemes        - authentication schemes
#
# Every operation returns `errorResult` as its `default` (error) response.
# Response status codes are quoted because OpenAPI requires them to be
# strings.
openapi: '3.0.3'
info:
  title: LOCKSS Crawler Service REST API
  description: REST API of the LOCKSS Crawler Service
  contact:
    name: LOCKSS Support
    url: https://www.lockss.org/
    email: lockss-support@lockss.org
  license:
    name: BSD-3-Clause
    url: https://opensource.org/licenses/BSD-3-Clause
  version: '2.0.0'
servers:
  - url: https://laaws.lockss.org:443/

# HTTP Basic authentication applies to every operation.
security:
  - basicAuth: []

tags:
  - name: crawls
    description: requests related to crawls
  - name: crawlers
    description: requests related to crawlers
  - name: jobs
    description: requests related to crawl jobs
  # Declared because the /status operation is tagged with it.
  - name: status
    description: requests related to the status of the service
  - name: ws
    description: legacy SOAP endpoint

paths:
  # ------------------------------------------------------------------
  # Crawlers
  # ------------------------------------------------------------------
  /crawlers:
    get:
      tags:
        - crawlers
      summary: Get the list of supported crawlers.
      description: Return the list of supported crawlers.
      operationId: getCrawlers
      responses:
        '200':
          description: The Status of supported Crawlers.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlerStatuses'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawlers/{crawlerId}:
    get:
      tags:
        - crawlers
      summary: Return information about a crawler.
      description: Get information related to an installed crawler.
      operationId: getCrawlerConfig
      parameters:
        - name: crawlerId
          in: path
          description: Identifier for the crawler
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Crawler Configuration Found
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlerConfig'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  # ------------------------------------------------------------------
  # Crawls: crawl status and the per-crawl URL lists
  # ------------------------------------------------------------------
  /crawls:
    get:
      tags:
        - crawls
      summary: Get the list of crawls.
      description: Get a list of crawls a pageful at a time as defined by limit.
      operationId: getCrawls
      parameters:
        - name: limit
          in: query
          description: The number of crawls per page
          schema:
            type: integer
            default: 50
        - name: continuationToken
          in: query
          description: The continuation token of the next page of crawl status data to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested crawls
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}:
    get:
      tags:
        - crawls
      summary: Get the crawl status of this job
      description: Get the job represented by this crawl id
      operationId: getCrawlById
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The crawl status of the requested crawl
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlStatus'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}/fetched:
    get:
      tags:
        - crawls
      summary: A pageable list of fetched urls.
      description: Get a list of fetched urls.
      operationId: getCrawlFetched
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
        - name: limit
          in: query
          description: The number of urls per page.
          schema:
            type: integer
        - name: continuationToken
          in: query
          description: The continuation token of the next page of urls to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested fetched urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}/excluded:
    get:
      tags:
        - crawls
      summary: A pageable list of excluded urls.
      description: Get a list of excluded urls.
      operationId: getCrawlExcluded
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
        - name: limit
          in: query
          description: The number of urls per page.
          schema:
            type: integer
        - name: continuationToken
          in: query
          description: The continuation token of the next page of urls to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested excluded urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}/notModified:
    get:
      tags:
        - crawls
      summary: A pageable list of not modified urls.
      description: Get a list of not modified urls.
      operationId: getCrawlNotModified
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
        - name: limit
          in: query
          description: The number of urls per page.
          schema:
            type: integer
        - name: continuationToken
          in: query
          description: The continuation token of the next page of urls to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested not modified urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}/pending:
    get:
      tags:
        - crawls
      summary: A pageable list of pending urls.
      description: Get a list of pending urls.
      operationId: getCrawlPending
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
        - name: limit
          in: query
          description: The number of urls per page.
          schema:
            type: integer
        - name: continuationToken
          in: query
          description: The continuation token of the next page of urls to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested pending urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}/parsed:
    get:
      tags:
        - crawls
      summary: A pageable list of parsed urls.
      description: Get a list of parsed urls.
      operationId: getCrawlParsed
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
        - name: limit
          in: query
          description: The number of urls per page.
          schema:
            type: integer
        - name: continuationToken
          in: query
          description: The continuation token of the next page of urls to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested parsed urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}/errors:
    get:
      tags:
        - crawls
      summary: A pageable list of urls with errors.
      description: Get a list of urls with errors.
      operationId: getCrawlErrors
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
        - name: limit
          in: query
          description: The number of urls per page.
          schema:
            type: integer
        - name: continuationToken
          in: query
          description: The continuation token of the next page of urls to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested urls with errors.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /crawls/{jobId}/mimeType/{type}:
    get:
      tags:
        - crawls
      summary: A pageable list of urls of mimetype.
      description: Get a list of urls of mimetype.
      operationId: getCrawlByMimeType
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl.
          required: true
          schema:
            type: string
        - name: type
          in: path
          description: The mime type of the urls to be returned.
          required: true
          schema:
            type: string
        - name: limit
          in: query
          description: The number of urls per page.
          schema:
            type: integer
        - name: continuationToken
          in: query
          description: The continuation token of the next page of urls to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested urls.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/urlPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  # ------------------------------------------------------------------
  # Jobs: queueing, listing and deleting crawl jobs
  # ------------------------------------------------------------------
  /jobs:
    get:
      tags:
        - jobs
      summary: Get the list of crawl jobs.
      description: >-
        Get a list of crawl jobs a pageful at a time as defined by the
        continuation token and limit.
      operationId: getJobs
      parameters:
        - name: limit
          in: query
          description: The number of jobs per page
          schema:
            type: integer
            default: 50
        - name: continuationToken
          in: query
          description: The continuation token of the next page of jobs to be returned.
          schema:
            type: string
      responses:
        '200':
          description: The requested crawl jobs
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/jobPager'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
    post:
      tags:
        - jobs
      summary: Request a crawl as defined by the descriptor
      description: Enqueue a new crawl job as defined by the crawl descriptor and return it.
      operationId: queueJob
      requestBody:
        description: crawl request
        content:
          # NOTE(review): '*/*' accepts any media type although the body is
          # a crawlDesc object; presumably application/json is intended --
          # confirm against the server and generated clients before changing.
          '*/*':
            schema:
              $ref: '#/components/schemas/crawlDesc'
        required: true
      responses:
        '202':
          description: The crawl request has been queued for operation.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlJob'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
      # Vendor extension naming the request-body parameter in generated code.
      x-codegen-request-body-name: crawlDesc
    delete:
      tags:
        - jobs
      summary: Delete all of the currently queued and active jobs
      description: Halt and delete all of the currently queued and active crawl jobs
      operationId: deleteJobs
      responses:
        '200':
          description: All crawl jobs have been stopped and deleted.
          # Success carries no response body.
          content: {}
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  /jobs/{jobId}:
    get:
      tags:
        - jobs
      summary: Get the crawl status of this job
      description: Get the crawl job with a given crawl id
      operationId: getCrawlJob
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl job.
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The crawl Job of the requested crawl
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlJob'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'
    delete:
      tags:
        - jobs
      summary: Remove or stop a crawl job.
      description: >-
        Delete a crawl job with the given job id, stopping any current
        processing, if necessary.
      operationId: deleteCrawlJob
      parameters:
        - name: jobId
          in: path
          description: The identifier used to identify a specific crawl job.
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The crawlJob of the deleted crawl.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/crawlJob'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  # ------------------------------------------------------------------
  # Service status
  # ------------------------------------------------------------------
  /status:
    get:
      tags:
        - status
      summary: Get the status of the service
      description: Get the status of the service
      operationId: getStatus
      responses:
        '200':
          description: The status of the service
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/apiStatus'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

  # ------------------------------------------------------------------
  # Legacy web-service query endpoint
  # ------------------------------------------------------------------
  /ws/crawls:
    get:
      tags:
        - ws
      summary: Query for list of crawls based on subset defined by query string
      description: Query for crawls that meet a set of specified conditions
      operationId: getWsCrawls
      parameters:
        - name: crawlQuery
          in: query
          description: The query that specifies the crawls to be returned
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Information about the requested crawls
          content:
            application/json:
              schema:
                # NOTE(review): the summary speaks of a list of crawls but
                # the schema is a single crawlWsResult object -- confirm
                # whether an array was intended.
                $ref: '#/components/schemas/crawlWsResult'
        default:
          description: The resulting error payload.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/errorResult'

components:
  schemas:
    # Error payload returned as the `default` response of every operation.
    errorResult:
      type: object
      required:
        - message
        - code
      properties:
        message:
          type: string
        code:
          type: integer
        rootCause:
          type: string

    # Response of GET /crawlers: crawlerStatus objects held in a map.
    crawlerStatuses:
      type: object
      properties:
        crawlerMap:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/crawlerStatus'
          description: A map of crawler status objects
      description: The statuses of the supported crawlers.

    mimeCounter:
      required:
        - mimeType
      type: object
      properties:
        mimeType:
          type: string
          description: The mime type to count.
        count:
          type: integer
          description: The number of elements of mime type
          format: int32
        counterLink:
          type: string
          description: A link to the list of count elements or to a pager with count elements.
      description: A counter for mimeTypes seen during a crawl.

    # Pagination block shared by urlPager, jobPager and crawlPager.
    pageInfo:
      required:
        - continuationToken
        - curLink
        - resultsPerPage
        - totalCount
      type: object
      properties:
        totalCount:
          type: integer
          description: The total number of elements to be paginated
          format: int32
        resultsPerPage:
          type: integer
          description: The number of results per page.
          format: int32
        continuationToken:
          type: string
          description: The continuation token.
        curLink:
          type: string
          description: The link to the current page.
        nextLink:
          type: string
          description: The link to the next page.
      description: The information related to pagination of content

    jobStatus:
      required:
        - statusCode
      type: object
      properties:
        statusCode:
          type: string
          description: The code for this status.
          enum:
            - STATUS_UNKNOWN
            - STATUS_QUEUED
            - STATUS_ACTIVE
            - STATUS_SUCCESSFUL
            - STATUS_ERROR
            - STATUS_ABORTED
            - STATUS_WINDOW_CLOSED
            - STATUS_FETCH_ERROR
            - STATUS_NO_PUB_PERMISSION
            - STATUS_PLUGIN_ERROR
            - STATUS_REPO_ERR
            - STATUS_RUNNING_AT_CRASH
            - STATUS_EXTRACTOR_ERROR
            - STATUS_CRAWL_TEST_SUCCESSFUL
            - STATUS_CRAWL_TEST_FAIL
            - STATUS_INELIGIBLE
            - STATUS_INACTIVE_REQUEST
            - STATUS_INTERRUPTED
        msg:
          type: string
          description: A text message explaining this status.
      description: A status which includes a code and a message.

    apiStatus:
      required:
        - apiVersion
        - ready
      type: object
      properties:
        apiVersion:
          type: string
          description: The version of the API
        componentName:
          type: string
          description: The name of the component
        componentVersion:
          type: string
          description: The version of the component software
        lockssVersion:
          type: string
          description: The version of the LOCKSS system
        ready:
          type: boolean
          description: The indication of whether the service is available
        serviceName:
          type: string
          description: The name of the service
        readyTime:
          type: integer
          description: The time the service last became ready.
          format: int64
        reason:
          type: string
          description: The reason the service isn't ready.
        startupStatus:
          type: string
          description: Enum indicating progress of plugin/AU processing at startup.
          enum:
            - NONE
            - PLUGINS_CRAWLING
            - PLUGINS_COLLECTED
            - PLUGINS_LOADING
            - PLUGINS_LOADED
            - AUS_STARTING
            - AUS_STARTED
      description: The status information of the service

    # Count plus link, used for each URL category of a crawlStatus.
    counter:
      required:
        - count
        - itemsLink
      type: object
      properties:
        count:
          type: integer
          description: The number of elements
          format: int32
        itemsLink:
          type: string
          description: A link to the list of count items or to a pager with count items.
      description: A counter for urls.

    # Request body of POST /jobs.
    crawlDesc:
      required:
        - auId
        - crawlKind
      type: object
      properties:
        auId:
          type: string
          description: The identifier of the archival unit to be crawled.
        crawlKind:
          type: string
          description: The kind of crawl being performed either 'newContent' or 'repair'.
          enum:
            - newContent
            - repair
        crawlerId:
          type: string
          description: The crawler to be used for this crawl.
          default: classic
        forceCrawl:
          type: boolean
          description: >-
            An indication of whether the crawl is to be forced, suppressing
            conditions that might otherwise prevent the crawl from happening.
          default: false
        refetchDepth:
          type: integer
          description: The refetch depth to use for a deep crawl.
          format: int32
          default: -1
        priority:
          type: integer
          description: The priority for the crawl.
          format: int32
          default: 0
        crawlList:
          type: array
          description: The list of URLs to crawl.
          items:
            type: string
        crawlDepth:
          type: integer
          description: >-
            The depth to which the links should be followed. 0 means do not
            follow links.
          format: int32
        extraCrawlerData:
          type: object
          additionalProperties:
            type: object
            properties: {}
          description: A map of additional properties for a crawl on a given crawler.
      description: A descriptor for a crawl.

    crawlJob:
      required:
        - crawlDesc
        - jobId
        - jobStatus
        - requestDate
      type: object
      properties:
        crawlDesc:
          $ref: '#/components/schemas/crawlDesc'
        requestDate:
          type: integer
          description: The timestamp when the crawl was requested.
          format: int64
        jobId:
          type: string
          description: Identifier of the crawl job.
        jobStatus:
          $ref: '#/components/schemas/jobStatus'
        startDate:
          type: integer
          description: The timestamp when the crawl began.
          format: int64
        endDate:
          type: integer
          description: The timestamp when the crawl ended.
          format: int64
        result:
          type: string
          description: A URI which can be used to retrieve the crawl data.
      description: The job resulting from a request to perform a crawl.

    crawlerStatus:
      required:
        - isEnabled
      type: object
      properties:
        isEnabled:
          type: boolean
          description: Is the crawler enabled
        isAutoCrawlEnabled:
          type: boolean
          description: Does crawler autocrawl AUs when needed.
        numJobsActive:
          type: integer
          description: The number of jobs running.
          format: int32
        numJobsFailed:
          type: integer
          description: The number of jobs failed.
          format: int32
        numJobsSuccessful:
          type: integer
          description: The number of jobs succeeded
          format: int32
        numJobsPending:
          type: integer
          description: The number of pending jobs
          format: int32
        errMessage:
          type: string
      description: Status about a specific crawler.

    crawlerConfig:
      required:
        - attributes
        - crawlerId
      type: object
      properties:
        crawlerId:
          type: string
          description: The identifier for this crawler
          example: classic
        attributes:
          type: object
          additionalProperties:
            type: string
          description: >-
            Key-value pairs providing attributes and configuration
            information specific to the crawler.
      description: Configuration information about a specific crawler.

    crawlStatus:
      required:
        - auId
        - auName
        - crawlerId
        - endTime
        - jobId
        - jobStatus
        - priority
        - startTime
        - startUrls
        - type
      type: object
      properties:
        jobId:
          type: string
          description: The id for the crawl.
        auId:
          type: string
          description: The id for the au.
        auName:
          type: string
          description: The name for the au.
        type:
          type: string
          description: The type of crawl.
        startUrls:
          type: array
          description: The array of start urls.
          items:
            type: string
        priority:
          type: integer
          description: The priority for this crawl.
          format: int32
        crawlerId:
          type: string
          description: The id of the crawler used for this crawl.
          default: classic
        sources:
          type: array
          description: The sources to use for the crawl.
          items:
            type: string
        depth:
          type: integer
          description: The depth of the crawl.
          format: int32
        refetchDepth:
          type: integer
          description: The refetch depth of the crawl.
          format: int32
        proxy:
          type: string
          description: The proxy used for crawling.
        startTime:
          type: integer
          description: The timestamp for the start of crawl.
          format: int64
        endTime:
          type: integer
          description: The timestamp for the end of the crawl.
          format: int64
        jobStatus:
          $ref: '#/components/schemas/jobStatus'
        isWaiting:
          type: boolean
          description: True if the crawl waiting to start.
        isActive:
          type: boolean
          description: True if the crawl is active.
        isError:
          type: boolean
          description: True if the crawl has errored.
        bytesFetched:
          type: integer
          description: The number of bytes fetched.
          format: int64
        fetchedItems:
          $ref: '#/components/schemas/counter'
        excludedItems:
          $ref: '#/components/schemas/counter'
        notModifiedItems:
          $ref: '#/components/schemas/counter'
        parsedItems:
          $ref: '#/components/schemas/counter'
        pendingItems:
          $ref: '#/components/schemas/counter'
        errors:
          $ref: '#/components/schemas/counter'
        mimeTypes:
          type: array
          description: The list of urls by mimeType.
          items:
            $ref: '#/components/schemas/mimeCounter'
      description: The status of a single crawl.

    urlPager:
      required:
        - pageInfo
        - urls
      type: object
      properties:
        pageInfo:
          $ref: '#/components/schemas/pageInfo'
        urls:
          type: array
          description: A list of urls with related info.
          items:
            $ref: '#/components/schemas/urlInfo'
      description: A Pager for urls with maps.

    jobPager:
      required:
        - jobs
        - pageInfo
      type: object
      properties:
        jobs:
          type: array
          description: The jobs displayed in the page
          items:
            $ref: '#/components/schemas/crawlJob'
        pageInfo:
          $ref: '#/components/schemas/pageInfo'
      description: A display page of jobs

    crawlPager:
      required:
        - crawls
        - pageInfo
      type: object
      properties:
        crawls:
          type: array
          description: The crawls displayed in the page
          items:
            $ref: '#/components/schemas/crawlStatus'
        pageInfo:
          $ref: '#/components/schemas/pageInfo'
      description: A display page of crawl status

    urlInfo:
      required:
        - url
      type: object
      properties:
        url:
          type: string
          description: The url string
        error:
          $ref: '#/components/schemas/urlError'
        referrers:
          type: array
          description: An optional list of referrers.
          items:
            type: string
      description: information related to an url.

    urlError:
      required:
        - message
        - severity
      type: object
      properties:
        message:
          type: string
          description: The error message
        severity:
          type: string
          description: the severity of the error.
          enum:
            - Warning
            - Error
            - Fatal
      description: information related to an error for a url.

    # Result payload of the legacy GET /ws/crawls query.
    # NOTE(review): startTime, duration and bytesFetchedCount are int32
    # here while crawlStatus uses int64 for its timestamps and byte count;
    # left unchanged to keep generated types stable -- confirm the intended
    # width.
    crawlWsResult:
      required:
        - auId
        - auName
      type: object
      properties:
        auId:
          type: string
        auName:
          type: string
        priority:
          type: integer
          format: int32
        crawlKey:
          type: string
        crawlType:
          type: string
        startTime:
          type: integer
          format: int32
        duration:
          type: integer
          format: int32
        crawlStatus:
          type: string
        bytesFetchedCount:
          type: integer
          format: int32
        pagesFetchedCount:
          type: integer
          format: int32
        pagesFetched:
          type: array
          items:
            type: string
        pagesParsedCount:
          type: integer
          format: int32
        pagesParsed:
          type: array
          items:
            type: string
        pagesPendingCount:
          type: integer
          format: int32
        pagesPending:
          type: array
          items:
            type: string
        pagesExcludedCount:
          type: integer
          format: int32
        pagesExcluded:
          type: array
          items:
            type: string
        offSiteUrlsExcludedCount:
          type: integer
          format: int32
        pagesNotModifiedCount:
          type: integer
          format: int32
        pagesNotModified:
          type: array
          items:
            type: string
        pagesWithErrorsCount:
          type: integer
          format: int32
        pagesWithErrors:
          type: array
          items:
            type: object
            properties:
              url:
                type: string
              severity:
                type: string
              message:
                type: string
        mimeTypeCount:
          type: integer
          format: int32
        mimeTypes:
          type: array
          items:
            type: string
        sources:
          type: array
          items:
            type: string
        startingUrls:
          type: array
          items:
            type: string
        refetchDepth:
          type: integer
          format: int32
        linkDepth:
          type: integer
          format: int32

  securitySchemes:
    basicAuth:
      type: http
      description: HTTP Basic Authentication. Works over `HTTP` and `HTTPS`
      scheme: basic