You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
46 lines
1.3 KiB
TypeScript
46 lines
1.3 KiB
TypeScript
import fs from "fs";
|
|
import mammoth from "mammoth";
|
|
import pdfParse from "pdf-parse";
|
|
import { NodeHtmlMarkdown } from "node-html-markdown";
|
|
|
|
/**
 * Reads the file at `filepath` and returns its textual content, choosing the
 * extraction strategy from the MIME type given in `filetype`.
 *
 * Supported MIME types:
 * - `application/pdf`: parsed with `pdf-parse`; the parser's `text` is returned.
 * - `application/vnd.openxmlformats-officedocument.wordprocessingml.document`
 *   (docx): raw text extracted with `mammoth`.
 * - `text/html`, `text/markdown`, `text/csv`: decoded to a string and passed
 *   through `NodeHtmlMarkdown.translate`.
 * - `text/plain`: decoded to a string and returned unchanged.
 *
 * @param filepath - Path of the file to read.
 * @param filetype - MIME type used to select the extractor.
 * @returns The extracted text.
 * @throws Error with message "Unsupported file type" when `filetype` matches
 *   none of the types above. File-read and parser errors are propagated as-is.
 */
export default async function extractTextFromFile({
  filepath,
  filetype,
}: {
  filepath: string;
  filetype: string;
}): Promise<string> {
  // The file is read before the type is inspected, so an unreadable or missing
  // file is reported even when the type would be rejected as unsupported.
  const buffer = await fs.promises.readFile(filepath);

  // Handle different file types using different modules
  switch (filetype) {
    case "application/pdf": {
      const pdfData = await pdfParse(buffer);
      return pdfData.text;
    }
    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {
      // i.e. docx file. Reuse the bytes already in memory rather than having
      // mammoth open and read the same path a second time.
      const docxResult = await mammoth.extractRawText({ buffer });
      return docxResult.value;
    }
    // NOTE(review): markdown and csv deliberately share the HTML branch here,
    // so they are also run through the HTML-to-Markdown converter. Confirm
    // this is intended rather than returning them verbatim like text/plain.
    case "text/markdown":
    case "text/csv":
    case "text/html":
      return NodeHtmlMarkdown.translate(buffer.toString());
    case "text/plain":
      return buffer.toString();
    default:
      throw new Error("Unsupported file type");
  }
}
|