Skip to content

Commit

Permalink
Deep Search PDF to MD file conversion
Browse files Browse the repository at this point in the history
Signed-off-by: Brent Salisbury <[email protected]>
  • Loading branch information
nerdalert committed Jul 11, 2024
1 parent 5b001fd commit 3ede608
Show file tree
Hide file tree
Showing 12 changed files with 522 additions and 102 deletions.
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,9 @@ IL_GRANITE_API=<GRANITE_HOST>
IL_GRANITE_MODEL_NAME=<GRANITE_MODEL_NAME>
IL_MERLINITE_API=<MERLINITE_HOST>
IL_MERLINITE_MODEL_NAME=<MERLINITE_MODEL_NAME>

DS_USERNAME=<DEEP_SEARCH_USER>
DS_API_KEY=<DEEP_SEARCH_API_KEY>
DS_HOST=<DEEP_SEARCH_HOST>
DS_PROJ_KEY=<DEEP_PROJECT_KEY>
DS_PROJ_NAME=<DEEP_PROJ_NAME>
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ npm-debug.log
.env
*.env
coverage
lib
taxonomy
config.yaml
generated
Expand Down
144 changes: 144 additions & 0 deletions src/app/api/conversion/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
'use server';

import { NextResponse, NextRequest } from 'next/server';
import fetch from 'node-fetch';

/**
 * Shape of the token endpoint response body. Only the field that the
 * handler reads is declared.
 */
type AuthData = {
  /** Token that the handler forwards in the `Authorization` header of later requests. */
  access_token: string;
};

/**
 * Shape of the convert endpoint response body. Only the field that the
 * handler reads is declared.
 */
type ConvertData = {
  /** Identifier used to build the `convert_tasks/<id>` status URL. */
  task_id: string;
};

/**
 * Shape of one conversion task status response.
 */
type TaskStatus = {
  /** Status string; the handler treats 'SUCCESS' and 'FAILURE' as terminal. */
  task_status: string;
  /** Conversion artifacts; optional because a status response may not carry them. */
  result?: {
    json_file_url: string;
    md_file_url: string;
    document_hash: string;
  };
};

/** Delay between two consecutive task status polls, in milliseconds. */
const POLL_INTERVAL_MS = 10000;

/**
 * Upper bound on status polls so a task that never reaches a terminal state
 * cannot keep the request handler alive forever.
 */
const MAX_POLL_ATTEMPTS = 60;

/** Resolves after `ms` milliseconds. */
const pause = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Converts a PDF stored in a GitHub repository by submitting its raw URL to
 * the conversion service configured through the `DS_*` environment variables.
 *
 * Expected JSON body: `{ repoUrl: string, documentNames: string[] }`. The first
 * entry of `documentNames` that ends with `.pdf` is converted; it is looked up
 * on the `main` branch of the repository named by `repoUrl`.
 *
 * Flow: authenticate -> start conversion -> poll the task until it finishes.
 *
 * @param req Incoming request carrying the JSON body described above.
 * @returns On success, JSON `{ json_file_url, md_file_url, document_hash }`.
 *   On failure, JSON `{ error }` with status 400 (bad input), 500 (missing
 *   configuration, failed task, unexpected error), 504 (task did not finish
 *   within the polling budget), or the upstream status code when one of the
 *   upstream calls responds with a non-OK status.
 */
export async function POST(req: NextRequest) {
  // Parse the body defensively: a malformed body is a client error, not a crash.
  let payload: unknown;
  try {
    payload = await req.json();
  } catch {
    console.error('Request body is not valid JSON');
    return NextResponse.json({ error: 'Request body is not valid JSON' }, { status: 400 });
  }
  const { repoUrl, documentNames } = (typeof payload === 'object' && payload !== null ? payload : {}) as {
    repoUrl?: unknown;
    documentNames?: unknown;
  };

  const USERNAME = process.env.DS_USERNAME;
  const API_KEY = process.env.DS_API_KEY;
  const HOST = process.env.DS_HOST;
  const PROJ_KEY = process.env.DS_PROJ_KEY;
  const BRANCH = 'main';

  if (!USERNAME || !API_KEY || !HOST || !PROJ_KEY) {
    console.error('Missing environment variables');
    return NextResponse.json({ error: 'Missing environment variables' }, { status: 500 });
  }

  if (typeof repoUrl !== 'string' || !Array.isArray(documentNames)) {
    console.error('Invalid request body: repoUrl and documentNames are required');
    return NextResponse.json({ error: 'repoUrl (string) and documentNames (array) are required' }, { status: 400 });
  }

  const pdfFileName = documentNames.find((name): name is string => typeof name === 'string' && name.endsWith('.pdf'));
  if (!pdfFileName) {
    console.error('No PDF file found for conversion');
    return NextResponse.json({ error: 'No PDF file found for conversion' }, { status: 400 });
  }

  const [repoOwner, repoName] = repoUrl.replace('https://github.com/', '').split('/');
  if (!repoOwner || !repoName) {
    console.error('Invalid repository URL:', repoUrl);
    return NextResponse.json({ error: 'Invalid repository URL' }, { status: 400 });
  }

  // Encode each path segment so names containing spaces, '#', '?' etc. still form a valid URL.
  const encodedPdfPath = pdfFileName.split('/').map(encodeURIComponent).join('/');
  const PDF_URL = `https://raw.githubusercontent.com/${repoOwner}/${repoName}/${BRANCH}/${encodedPdfPath}`;
  console.log(`PDF URL for conversion: ${PDF_URL}`);

  try {
    console.log('Starting authentication...');
    const authResponse = await fetch(`${HOST}/api/cps/user/v1/user/token`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Basic ${Buffer.from(`${USERNAME}:${API_KEY}`).toString('base64')}`
      },
      body: JSON.stringify({})
    });

    if (!authResponse.ok) {
      const error = await authResponse.text();
      console.error('Error during authentication:', error);
      return NextResponse.json({ error }, { status: authResponse.status });
    }

    const authData = (await authResponse.json()) as Partial<AuthData> | null;
    const token = authData?.access_token;
    if (typeof token !== 'string' || token === '') {
      console.error('Authentication response did not contain an access token');
      return NextResponse.json({ error: 'Authentication response did not contain an access token' }, { status: 500 });
    }
    console.log('Authentication successful. Token obtained.');

    console.log('Starting PDF conversion...');
    const convertResponse = await fetch(`${HOST}/api/cps/public/v2/project/${PROJ_KEY}/convert`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: token
      },
      body: JSON.stringify({
        http_source: { url: PDF_URL, headers: {} }
      })
    });

    if (!convertResponse.ok) {
      const error = await convertResponse.text();
      console.error('Error during PDF conversion:', error);
      return NextResponse.json({ error }, { status: convertResponse.status });
    }

    const convertData = (await convertResponse.json()) as Partial<ConvertData> | null;
    const taskId = convertData?.task_id;
    if (typeof taskId !== 'string' || taskId === '') {
      console.error('Conversion response did not contain a task id');
      return NextResponse.json({ error: 'Conversion response did not contain a task id' }, { status: 500 });
    }
    console.log(`PDF conversion started. Task ID: ${taskId}`);

    console.log('Checking conversion task status...');
    let finished: TaskStatus | undefined;
    for (let attempt = 1; attempt <= MAX_POLL_ATTEMPTS; attempt++) {
      // NOTE(review): `wait=10` presumably asks the service to hold the request
      // for up to 10 seconds — confirm against the service documentation.
      const taskResponse = await fetch(`${HOST}/api/cps/public/v2/project/${PROJ_KEY}/convert_tasks/${encodeURIComponent(taskId)}?wait=10`, {
        method: 'GET',
        headers: {
          Authorization: token
        }
      });

      if (!taskResponse.ok) {
        const error = await taskResponse.text();
        console.error('Error during task status check:', error);
        return NextResponse.json({ error }, { status: taskResponse.status });
      }

      const taskText = await taskResponse.text();
      let parsed: unknown;
      try {
        parsed = JSON.parse(taskText);
      } catch {
        parsed = undefined;
      }
      // Reject both unparsable text and JSON that is not an object (e.g. `null`).
      if (typeof parsed !== 'object' || parsed === null) {
        console.error('Error parsing task status response:', taskText);
        return NextResponse.json({ error: 'Failed to parse task status response' }, { status: 500 });
      }
      const current = parsed as TaskStatus;

      console.log(`Task status: ${current.task_status}`);

      // A failed task is final even when no result is attached; a successful
      // one is only usable once its result is present.
      const failed = current.task_status === 'FAILURE';
      const succeeded = current.task_status === 'SUCCESS' && Boolean(current.result);
      if (failed || succeeded) {
        finished = current;
        break;
      }

      // No need to sleep after the final attempt.
      if (attempt < MAX_POLL_ATTEMPTS) {
        await pause(POLL_INTERVAL_MS);
      }
    }

    if (!finished) {
      console.error('Timed out waiting for PDF conversion task to complete.');
      return NextResponse.json({ error: 'Timed out waiting for PDF conversion task to complete' }, { status: 504 });
    }

    if (finished.task_status === 'FAILURE' || !finished.result) {
      console.error('PDF Conversion Task failed.');
      return NextResponse.json({ error: 'PDF Conversion Task failed' }, { status: 500 });
    }

    // Copy only the documented fields so extra upstream data is not forwarded.
    const result = {
      json_file_url: finished.result.json_file_url,
      md_file_url: finished.result.md_file_url,
      document_hash: finished.result.document_hash
    };

    console.log('Task completed successfully.');
    console.log(`JSON file URL: ${result.json_file_url}`);
    console.log(`Markdown file URL: ${result.md_file_url}`);
    console.log(`Document hash: ${result.document_hash}`);

    return NextResponse.json(result);
  } catch (error: unknown) {
    console.error('Unexpected error:', error);
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
2 changes: 1 addition & 1 deletion src/app/api/pr/knowledge/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down
2 changes: 1 addition & 1 deletion src/app/api/pr/skill/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down
16 changes: 7 additions & 9 deletions src/app/api/upload/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import { getToken } from 'next-auth/jwt';
import { NextRequest } from 'next/server';

const GITHUB_API_URL = 'https://api.github.com';
const TAXONOMY_DOCUMENTS_REPO = process.env.TAXONOMY_DOCUMENTS_REPO!;
const TAXONOMY_DOCUMENTS_REPO = process.env.NEXT_PUBLIC_TAXONOMY_DOCUMENTS_REPO!;
const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down Expand Up @@ -64,7 +64,8 @@ export async function POST(req: NextRequest) {
const [name, extension] = file.fileName.split(/\.(?=[^.]+$)/);
return {
fileName: `${name}-${timestamp}.${extension}`,
fileContent: file.fileContent
fileContent: file.fileContent,
encoding: extension === 'pdf' ? 'base64' : 'utf-8'
};
});

Expand Down Expand Up @@ -160,7 +161,7 @@ async function createFilesCommit(
owner: string,
repo: string,
branchName: string,
files: { fileName: string; fileContent: string }[],
files: { fileName: string; fileContent: string; encoding: string }[],
userEmail: string,
baseSha: string
): Promise<string> {
Expand All @@ -173,7 +174,7 @@ async function createFilesCommit(
headers,
body: JSON.stringify({
content: file.fileContent,
encoding: 'utf-8'
encoding: file.encoding
})
}).then((response) => response.json())
)
Expand Down Expand Up @@ -202,12 +203,9 @@ async function createFilesCommit(
}

const treeData = await createTreeResponse.json();
console.log('Tree created:', treeData);
// console.log('Tree created:', treeData);

// Create commit with DCO sign-off
// TODO: if the user's github does not have an associated github email, we need to specify one in the upload section
// or reuse the one from the form. If we use the email field from the form, it needs to be null checked when
// the user clicks the upload documents button.
const createCommitResponse = await fetch(`${GITHUB_API_URL}/repos/${owner}/${repo}/git/commits`, {
method: 'POST',
headers,
Expand Down
4 changes: 2 additions & 2 deletions src/app/edit-submission/knowledge/[id]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ Creator names: ${updatedAttributionData.creator_names}
className={useFileUpload ? 'button-active' : 'button-secondary'}
onClick={() => setUseFileUpload(true)}
>
Automatically Upload Documents
Upload Documents
</Button>
</div>
</FormGroup>
Expand Down Expand Up @@ -537,7 +537,7 @@ Creator names: ${updatedAttributionData.creator_names}
</FormGroup>
) : (
<>
<UploadFile onFilesChange={handleFilesChange} />
<UploadFile onFilesChange={handleFilesChange} files={uploadedFiles} isConverting={false} conversionMessage="" />
<Button variant="primary" onClick={handleDocumentUpload}>
Submit Files
</Button>
Expand Down
Loading

0 comments on commit 3ede608

Please sign in to comment.