Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Manifest enrichment #699

Open
wants to merge 10 commits into
base: feature/IDA-893-enrichment-integration
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions services/madoc-ts/src/extensions/enrichment/extension.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { Topic, TopicType, TopicTypeListResponse } from '../../types/schemas/topics';
import { BaseDjangoExtension } from './base-django-extension';
import { EnrichmentIndexPayload } from './types';
import { EnrichmentIndexPayload, EnrichmentPlaintext, EnrichmentTask } from './types';
import { ApiKey } from '../../types/api-key';
import { SearchQuery, SearchResponse } from '../../types/search';
import {
Expand Down Expand Up @@ -79,7 +79,28 @@ export class EnrichmentExtension extends BaseDjangoExtension {
}

getEnrichmentTask(id: string) {
return this.api.request(`/api/enrichment/task_log/${id}`);
return this.api.request<EnrichmentTask>(`/api/enrichment/task_log/${id}`);
}
/**
 * Requests `/api/enrichment/plaintext/{id}` through the shared API client.
 *
 * @param id Identifier interpolated into the request path.
 * @returns Whatever `this.api.request` returns, typed as `EnrichmentPlaintext`.
 */
getEnrichmentPlaintext(id: string) {
  const endpoint = `/api/enrichment/plaintext/${id}`;
  return this.api.request<EnrichmentPlaintext>(endpoint);
}

/**
 * Sends a POST to the Madoc manifest enrichment route for one manifest.
 *
 * @param id Manifest id interpolated into the request path.
 * @returns Whatever `this.api.request` returns, typed as `EnrichmentTask`.
 */
enrichManifest(id: number) {
  const endpoint = `/api/madoc/iiif/manifests/${id}/enrichment`;
  return this.api.request<EnrichmentTask>(endpoint, { method: 'POST' });
}

/**
 * Posts a new `madoc_manifest_enrichment_pipeline` task to the enrichment tasks endpoint.
 *
 * @param id Manifest id; sent as the task subject `urn:madoc:manifest:{id}`.
 * @param callback Optional URL sent as the `callback_url` task parameter.
 * @returns Whatever `this.api.request` returns, typed as `EnrichmentTask`.
 */
enrichManifestInternal(id: number, callback?: string) {
  const subject = `urn:madoc:manifest:${id}`;
  const parameters = [{ callback_url: callback }];

  return this.api.request<EnrichmentTask>(`/api/enrichment/tasks/madoc_manifest_enrichment_pipeline`, {
    method: 'POST',
    body: { task: { subject, parameters } },
  });
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the call to kick off the enrichment pipeline


allTasks = [
Expand Down
35 changes: 35 additions & 0 deletions services/madoc-ts/src/extensions/enrichment/types.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Canvas, Collection, InternationalString, Manifest } from '@iiif/presentation-3';
import { BaseTask } from '../../gateway/tasks/base-task';

export type EnrichmentIndexPayload = {
madoc_id: string;
Expand All @@ -16,3 +17,37 @@ export interface DjangoPagination<T> {
previous: string;
results: T[];
}

/**
 * Abbreviated task record. Within this file it is the type of
 * `EnrichmentTask.parent_task` and the element type of `EnrichmentTask.child_tasks`.
 */
export interface EnrichmentTaskSnippet {
url: string;
id: string;
created: string;
modified: string;
name: string;
// Untyped payload: its shape is not declared here, so narrow it before use.
state: any;
// NOTE(review): numeric status code and its text form; the set of valid values is not visible here — confirm.
status: number;
status_text: string;
task_type: string;
task_class: string;
}

/**
 * Task shape used by the enrichment extension's typed requests.
 *
 * Starts from `BaseTask` with its `parent_task`, `type` and `subtasks` fields removed,
 * then declares the enrichment-specific fields below (including a re-typed `parent_task`).
 */
export interface EnrichmentTask extends Omit<BaseTask, 'parent_task' | 'type' | 'subtasks'> {
url: string;
task_type: string; // Type — NOTE(review): presumably stands in for the omitted `BaseTask['type']`; confirm.
parent_task: EnrichmentTaskSnippet;
errors: string[];
// NOTE(review): presumably the counterpart of the omitted `subtasks`; confirm.
child_tasks: EnrichmentTaskSnippet[];
task_class: string;
}

/**
 * Plaintext record shape used as the response type of the enrichment
 * extension's `getEnrichmentPlaintext` request.
 */
export interface EnrichmentPlaintext {
url: string;
id: string;
created: string;
modified: string;
source: string;
ocr_backend: string;
ocr_format: string;
plaintext: string;
// NOTE(review): the name suggests a list but the declared type is `string` — confirm against the enrichment service response.
plaintext_list: string;
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export const ManifestSearchIndex = createUniversalComponent<ManifestSearchIndexT

const api = useApi();
const [invokeEnrichment, { isLoading: enrichLoading }] = useMutation(async () => {
await api.triggerSearchIndex(Number(id), 'manifest');
await api.enrichment.enrichManifest(Number(id));
await refetch();
});

Expand Down
15 changes: 14 additions & 1 deletion services/madoc-ts/src/gateway/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,14 @@ export class ApiClient {
// Enrichment
this.authority = new AuthorityExtension(this);
this.enrichment = new EnrichmentExtension(this);
this.webhooks = new WebhookExtension(this);
this.search = new SearchExtension(this);

if (options.withoutExtensions) {
this.crowdsourcing = new CrowdsourcingApi(this, null, captureModelDataSources);
return;
}

this.webhooks = new WebhookExtension(this);
this.pageBlocks = new PageBlockExtension(this, getDefaultPageBlockDefinitions());
this.media = new MediaExtension(this);
this.system = new SystemExtension(this);
Expand Down Expand Up @@ -1077,6 +1077,13 @@ export class ApiClient {
return this.request<{ found: boolean; transcription: string }>(`/api/madoc/iiif/canvases/${id}/plaintext`);
}

/**
 * POSTs plaintext for a canvas to `/api/madoc/iiif/canvases/{id}/plaintext`.
 *
 * @param id Canvas id interpolated into the request path.
 * @param plaintext Text sent as the `plaintext` field of the request body.
 * @returns The request result, typed as `{ success, empty }`.
 */
async updateCanvasPlaintext(id: number, plaintext: string) {
  const endpoint = `/api/madoc/iiif/canvases/${id}/plaintext`;
  return this.request<{ success: boolean; empty: boolean }>(endpoint, {
    method: 'POST',
    body: { plaintext },
  });
}

async getCanvasDeletionSummary(id: number) {
return this.request<CanvasDeletionSummary>(`/api/madoc/iiif/canvases/${id}/deletion-summary`);
}
Expand Down Expand Up @@ -1757,6 +1764,12 @@ export class ApiClient {
});
}

/**
 * Issues a DELETE against `/api/madoc/iiif/linking/{id}`.
 *
 * @param id Linking property id interpolated into the request path.
 */
async deleteLinkingProperty(id: number) {
  const endpoint = `/api/madoc/iiif/linking/${id}`;
  return this.request(endpoint, { method: 'DELETE' });
}

async getStorageRaw(bucket: string, fileName: string, isPublic = false) {
return this.request<Response>(
isPublic ? `/api/storage/data/${bucket}/public/${fileName}` : `/api/storage/data/${bucket}/${fileName}`,
Expand Down
4 changes: 4 additions & 0 deletions services/madoc-ts/src/router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import { keyRegenerate } from './routes/admin/key-regenerate';
import { listApiKeys } from './routes/admin/list-api-keys';
import { getProjectAnnotationStyle } from './routes/annotation-styles/get-project-annotation-style';
import { annotationStyles } from './routes/annotation-styles/index';
import { manifestEnrichmentPipeline } from "./routes/enrichment/manifest-enrichment-pipeline";
import { searchAllUsers } from './routes/global/search-all-users';
import { systemCheck } from './routes/global/system-check';
import { addPlaintext } from './routes/iiif/linking/add-plaintext';
import { getAutomatedUsers } from './routes/manage-site/get-automated-users';
import { createProjectExport } from './routes/projects/create-project-export';
import { getProjectRawData } from './routes/projects/get-project-raw-data';
Expand Down Expand Up @@ -415,6 +417,7 @@ export const router = new TypedRouter({
'get-manifest-linking': [TypedRouter.GET, '/api/madoc/iiif/manifests/:id/linking', getLinking],
'get-manifest-canvas-linking': [TypedRouter.GET, '/api/madoc/iiif/manifests/:id/canvas-linking', getParentLinking],
'search-index-manifest': [TypedRouter.POST, '/api/madoc/iiif/manifests/:id/index', indexManifest],
'search-enrich-manifest': [TypedRouter.POST, '/api/madoc/iiif/manifests/:id/enrichment', manifestEnrichmentPipeline],

// Canvas API
'list-canvases': [TypedRouter.GET, '/api/madoc/iiif/canvases', listCanvases],
Expand All @@ -432,6 +435,7 @@ export const router = new TypedRouter({
'search-index-canvas': [TypedRouter.POST, '/api/madoc/iiif/canvases/:id/index', indexCanvas],
'convert-linking-property': [TypedRouter.POST, '/api/madoc/iiif/linking/:id/convert', convertLinking],
'get-canvas-plaintext': [TypedRouter.GET, '/api/madoc/iiif/canvases/:id/plaintext', getCanvasPlaintext],
'update-canvas-plaintext': [TypedRouter.POST, '/api/madoc/iiif/canvases/:id/plaintext', addPlaintext],
'get-canvas-source': [TypedRouter.GET, '/api/madoc/iiif/canvas-source', getCanvasReference],
'get-canvas-deletion-summary': [
TypedRouter.GET,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import invariant from 'tiny-invariant';
import { api } from '../../gateway/api.server';
import { RouteMiddleware } from '../../types/route-middleware';
import { parseUrn } from '../../utility/parse-urn';
import { userWithScope } from '../../utility/user-with-scope';
import { IncomingWebhook, WebhookEventType } from '../../webhooks/webhook-types';

/**
 * Route handler that starts the enrichment pipeline for the manifest in `context.params.id`.
 *
 * Requires the `site.admin` scope, generates a webhook URL for the
 * `manifestEnrichmentPipelineEvent` event, passes it as the callback when creating
 * the enrichment task, and responds with the created task.
 */
export const manifestEnrichmentPipeline: RouteMiddleware<{ id: number }> = async context => {
  const { siteId } = userWithScope(context, ['site.admin']);
  const currentSite = await context.siteManager.getSiteById(siteId);
  const client = api.asUser({ siteId });

  // The webhook token is valid for 12 hours (value is in seconds).
  const tokenLifetime = 12 * 3600;
  const callbackUrl = await context.webhookExtension.generateWebhookUrl(
    currentSite,
    manifestEnrichmentPipelineEvent.event_id,
    tokenLifetime
  );

  const createdTask = await client.enrichment.enrichManifestInternal(context.params.id, callbackUrl);
  context.response.body = createdTask;
};

/**
 * Webhook event descriptor for the manifest enrichment pipeline.
 *
 * Its `event_id` is the one passed to `generateWebhookUrl` in `manifestEnrichmentPipeline`;
 * `body_variables` declares `id` as the field expected in the webhook body.
 */
export const manifestEnrichmentPipelineEvent: WebhookEventType = {
event_id: 'manifest-enrichment-pipeline.complete',
body_variables: ['id'],
};

/**
 * Incoming webhook that ingests the result of a finished enrichment task.
 *
 * `execute` receives the webhook body and a site-scoped API client. It:
 *  1. requires `id` in the body and loads that enrichment task;
 *  2. requires the task to have a subject and a status of 3;
 *  3. for `ocr_madoc_resource` tasks only, requires the subject to be a canvas URN,
 *     fetches the plaintext referenced by the first entry of `state.ocr_resources`
 *     and stores it on that canvas.
 *
 * Any other task type, a task without OCR resources, or an empty plaintext is a no-op
 * that resolves to `undefined`. Failed checks throw via `invariant`.
 */
export const manifestEnrichmentHook: IncomingWebhook = {
  type: 'manifest-enrichment-pipeline-task-ingest',
  // Reuse the registered event descriptor so the hook and the event id cannot drift apart.
  event_id: manifestEnrichmentPipelineEvent.event_id,
  is_outgoing: false,
  execute: async (resp, siteApi) => {
    // Status value this hook treats as "complete" (see the invariant message below).
    const completedStatus = 3;

    invariant(resp && resp.id, 'Expected response to contain `id`');

    const task = await siteApi.enrichment.getEnrichmentTask(resp.id);
    invariant(task.subject, 'Missing subject on task');
    invariant(task.status === completedStatus, 'Task is not yet complete');

    // Only OCR tasks carry plaintext to ingest; everything else is ignored.
    if (task.task_type !== 'ocr_madoc_resource') {
      return;
    }

    const parsed = parseUrn(task.subject);
    invariant(parsed, 'Invalid subject');
    invariant(parsed.type === 'canvas', 'Can only process canvases');

    // Only the first OCR resource is ingested; additional entries are currently ignored.
    const ocrResources = task.state && task.state.ocr_resources;
    const firstResource = ocrResources && ocrResources[0];
    if (!firstResource) {
      return;
    }

    const enrichmentPlaintext = await siteApi.enrichment.getEnrichmentPlaintext(firstResource);
    invariant(enrichmentPlaintext, 'Missing plaintext from enrichment');
    if (!enrichmentPlaintext.plaintext) {
      return;
    }

    // The id parsed from the subject URN is used as the canvas id.
    return await siteApi.updateCanvasPlaintext(parsed.id, enrichmentPlaintext.plaintext);
  },
};
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the main part (the other parts mostly fill gaps in Madoc's API).

  • manifestEnrichmentPipeline is the API route handler when an admin hits "Enrich"
    • Generates a webhook URL (12-hour token)
    • Creates the enrichment task
    • Returns task (@mattmcgrattan it would be useful to omit callback_url from the state in the future)
  • manifestEnrichmentPipelineEvent is a short description of the webhook "type" and the fields expected in the response.
  • manifestEnrichmentHook is the function that is called when the task is complete. It receives the webhook POST body (JSON) and an instance of the siteApi (already mapped to the correct site).
    • Fetch the enrichment task
    • Validate that it has a subject + is complete
    • If it's an ocr_madoc_resource:
      • Parse + validate the subject
      • Check for ocr_resources (@mattmcgrattan this will need to change if there is more than one here)
      • Fetch the plaintext + validate what we expect to see
      • Attach the plaintext to the canvas.

53 changes: 53 additions & 0 deletions services/madoc-ts/src/routes/iiif/linking/add-plaintext.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import { api } from '../../../gateway/api.server';
import { RouteMiddleware } from '../../../types/route-middleware';
import { userWithScope } from '../../../utility/user-with-scope';
import { linkHash } from './convert-linking';

/**
 * Route handler that replaces the plaintext attached to a canvas.
 *
 * Requires the `site.admin` scope. The canvas id comes from the `:id` route parameter
 * and the text from the `plaintext` field of the request body.
 *
 * - Missing, non-string or whitespace-only plaintext: responds `{ success: true, empty: true }`
 *   without touching storage or linking.
 * - Otherwise: deletes every existing `seeAlso` link with format `text/plain` on the canvas,
 *   saves the text to the `plaintext` bucket under a path derived from the canvas id and a
 *   hash of the text, links that file to the canvas as `seeAlso`, and responds
 *   `{ success: true, empty: false }`.
 */
export const addPlaintext: RouteMiddleware<{ id: number }, { plaintext: string }> = async context => {
  const { siteId } = userWithScope(context, ['site.admin']);
  // The id is a path parameter (`.../canvases/:id/plaintext`), not a query-string value;
  // reading it from `context.query` yields `undefined` and therefore `NaN`.
  const canvasId = Number(context.params.id);
  const plaintext = context.requestBody.plaintext;

  // Short-circuit before any API calls when there is nothing to store.
  if (typeof plaintext !== 'string' || !plaintext.trim()) {
    context.response.status = 200;
    context.response.body = { success: true, empty: true };
    return;
  }

  const siteApi = api.asUser({ siteId });
  const linking = await siteApi.getCanvasLinking(canvasId);

  const existingPlaintexts = linking.linking.filter(singleLink => {
    return singleLink.property === 'seeAlso' && singleLink.link.format === 'text/plain';
  });

  // Remove previous plaintext links so the canvas ends up with exactly one.
  for (const existingPlaintext of existingPlaintexts) {
    await siteApi.deleteLinkingProperty(existingPlaintext.id);
  }

  // Create new plaintext and insert it.
  const bucket = 'plaintext';
  const filePath = `public/${canvasId}/${linkHash(plaintext)}.txt`;

  await siteApi.saveStoragePlainText(bucket, filePath, plaintext, true);

  await siteApi.addLinkToResource({
    label: 'Plaintext',
    link: {
      id: `/public/storage/urn:madoc:site:${siteId}/${bucket}/${filePath}`,
      format: 'text/plain',
      label: 'Plaintext',
      type: 'Text',
      file_path: filePath,
      file_bucket: bucket,
    },
    resource_id: canvasId as any,
    property: 'seeAlso',
  });

  context.response.status = 200;
  context.response.body = { success: true, empty: false };
};
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { RequestError } from '../../../utility/errors/request-error';
import { userWithScope } from '../../../utility/user-with-scope';
import contentType from 'content-type';

function linkHash(uri: string) {
export function linkHash(uri: string) {
return createHash('sha1')
.update(uri)
.digest('hex');
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { generateId } from '../../frontend/shared/capture-models/helpers/generate-id';
import { api } from '../../gateway/api.server';
import { RouteMiddleware } from '../../types/route-middleware';
import { optionalUserWithScope } from '../../utility/user-with-scope';
import { WebhookCallRow } from '../webhook-types';
Expand All @@ -14,6 +15,8 @@ export const executeWebhookInternal: RouteMiddleware<{ event_id: string }> = asy
const results = { success: 0, fail: 0 };
const callId = generateId();

const siteApi = api.asUser({ siteId }, {}, true);

const databaseHooks = await context.webhooks.listWebhooksByEvent(eventId, siteId);

for (const databaseHook of databaseHooks) {
Expand Down Expand Up @@ -64,7 +67,7 @@ export const executeWebhookInternal: RouteMiddleware<{ event_id: string }> = asy
continue;
} else {
// Do internal thing.
result.response = (await hook.execute(body)) || {};
result.response = (await hook.execute(body, siteApi)) || {};
}
results.success++;
} catch (e) {
Expand Down
3 changes: 2 additions & 1 deletion services/madoc-ts/src/webhooks/webhook-events.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { manifestEnrichmentPipelineEvent } from '../routes/enrichment/manifest-enrichment-pipeline';
import { WebhookEventType } from './webhook-types';

// Sample webhook event definition (`test-event`); its only declared body variable is `hello`.
const testEvent = {
event_id: 'test-event',
body_variables: ['hello'] as const,
};

export const webhookEvents: WebhookEventType[] = [testEvent];
export const webhookEvents: WebhookEventType[] = [testEvent, manifestEnrichmentPipelineEvent];
7 changes: 5 additions & 2 deletions services/madoc-ts/src/webhooks/webhook-server-extension.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import invariant from 'tiny-invariant';
import { RegistryExtension } from '../extensions/registry-extension';
import { generateId } from '../frontend/shared/capture-models/helpers/generate-id';
import { apiGateway, gatewayHost } from '../gateway/api.server';
import { manifestEnrichmentHook } from '../routes/enrichment/manifest-enrichment-pipeline';
import { getPem, getPublicPem } from '../utility/get-pem';
import { IncomingWebhook, OutgoingWebhook } from './webhook-types';
import { JWK, JWS } from 'jose';
Expand All @@ -13,12 +14,14 @@ export class WebhookServerExtension extends RegistryExtension<IncomingWebhook |
registryName: 'webhook',
});

WebhookServerExtension.register(manifestEnrichmentHook);

WebhookServerExtension.register({
is_outgoing: false,
type: 'example-test',
event_id: 'test-event',
execute: body => {
console.log('Did this work?', body);
console.log('WebHooks - test event:', body);
},
});
}
Expand All @@ -41,7 +44,7 @@ export class WebhookServerExtension extends RegistryExtension<IncomingWebhook |
code: await this.sign({ eventId, expires, siteId: site.id }),
};

return `${internal ? apiGateway : gatewayHost}/s/${site.slug}/madoc/api/webhook?${stringify(query)}}`;
return `${internal ? apiGateway : gatewayHost}/s/${site.slug}/madoc/api/webhook?${stringify(query)}`;
}

getHooksForEvents(eventId: string, siteId: number): Array<IncomingWebhook | OutgoingWebhook> {
Expand Down
4 changes: 3 additions & 1 deletion services/madoc-ts/src/webhooks/webhook-types.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { ApiClient } from '../gateway/api';

export interface WebhookRow {
id: string;
event_id: string;
Expand Down Expand Up @@ -43,6 +45,6 @@ export interface IncomingWebhook {
is_outgoing: false;
type: string;
event_id: string;
execute: (request: any) => any | Promise<any>;
execute: (request: any, api: ApiClient) => any | Promise<any>;
source?: { type: string; id?: string; name: string };
}