使用 PDFium 导出 PDF 中的图像
✅ 修正版:支持递归提取 Form XObject(表单对象)中的图片对象
我在你的基础上只增加了一个递归遍历函数,不影响你现有的逻辑和日志输出。
#include <cstddef>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <system_error>
#include <type_traits>
#include <vector>

#include "fpdfview.h"
#include "fpdf_edit.h"
#include "fpdf_save.h"

// stb_image_write supplies the JPEG encoder (stbi_write_jpg).
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

namespace {

/// Deleter that releases a bitmap handle through FPDFBitmap_Destroy.
struct BitmapDeleter {
    void operator()(FPDF_BITMAP bitmap) const { FPDFBitmap_Destroy(bitmap); }
};

/// Owning handle for an FPDF_BITMAP; the bitmap is destroyed on every exit path.
using ScopedBitmap = std::unique_ptr<std::remove_pointer<FPDF_BITMAP>::type, BitmapDeleter>;

/// Blends one straight-alpha (non-premultiplied) channel value over white.
unsigned char blend_over_white(unsigned char value, unsigned char alpha) {
    // Rounded integer form of: value * a + 255 * (1 - a), with a = alpha / 255.
    return static_cast<unsigned char>((value * alpha + 255 * (255 - alpha) + 127) / 255);
}

/// Blends one premultiplied channel value over white.
unsigned char blend_premul_over_white(unsigned char value, unsigned char alpha) {
    // The channel already contains value * a, so only the background term is added.
    const int blended = value + (255 - alpha);
    return static_cast<unsigned char>(blended > 255 ? 255 : blended);
}

}  // namespace

/// Converts a PDFium bitmap into tightly packed 8-bit RGB (3 bytes per pixel).
///
/// JPEG has no alpha channel, so pixels of the alpha-carrying formats
/// (BGRA, BGRA_Premul) are composited over a white background instead of
/// having their alpha byte silently dropped.
///
/// @param bitmap   Bitmap handle to read; must be a valid handle.
/// @param rgb_data Receives width * height * 3 bytes on success.
/// @return true on success; false if the handle, size, stride or buffer is
///         invalid, or if the pixel format is not supported.
bool convert_to_rgb(FPDF_BITMAP bitmap, std::vector<unsigned char>& rgb_data) {
    if (!bitmap) {
        return false;
    }

    const int format = FPDFBitmap_GetFormat(bitmap);
    const int width = FPDFBitmap_GetWidth(bitmap);
    const int height = FPDFBitmap_GetHeight(bitmap);
    const int stride = FPDFBitmap_GetStride(bitmap);
    const auto* buffer = static_cast<const unsigned char*>(FPDFBitmap_GetBuffer(bitmap));

    std::size_t src_bpp = 0;  // bytes per source pixel
    switch (format) {
        case FPDFBitmap_Gray:
            src_bpp = 1;
            break;
        case FPDFBitmap_BGR:
            src_bpp = 3;
            break;
        case FPDFBitmap_BGRx:
        case FPDFBitmap_BGRA:
        case FPDFBitmap_BGRA_Premul:
            src_bpp = 4;
            break;
        default:
            std::cerr << "❌ Unsupported bitmap format: " << format << std::endl;
            return false;
    }

    if (width <= 0 || height <= 0 || stride <= 0 || buffer == nullptr) {
        std::cerr << "❌ Invalid bitmap geometry or buffer." << std::endl;
        return false;
    }

    // All index arithmetic is done in size_t: width * height * 3 overflows int
    // for large rendered images.
    const std::size_t w = static_cast<std::size_t>(width);
    const std::size_t h = static_cast<std::size_t>(height);
    const std::size_t row_bytes = static_cast<std::size_t>(stride);
    if (row_bytes < w * src_bpp || h > rgb_data.max_size() / 3 / w) {
        std::cerr << "❌ Invalid bitmap geometry or buffer." << std::endl;
        return false;
    }

    rgb_data.resize(w * h * 3);
    unsigned char* dst = rgb_data.data();

    for (std::size_t y = 0; y < h; ++y) {
        const unsigned char* src = buffer + y * row_bytes;
        for (std::size_t x = 0; x < w; ++x, src += src_bpp, dst += 3) {
            switch (format) {
                case FPDFBitmap_Gray:
                    dst[0] = src[0];
                    dst[1] = src[0];
                    dst[2] = src[0];
                    break;
                case FPDFBitmap_BGR:
                case FPDFBitmap_BGRx:
                    // Source byte order is B, G, R (the 4th BGRx byte is unused).
                    dst[0] = src[2];
                    dst[1] = src[1];
                    dst[2] = src[0];
                    break;
                case FPDFBitmap_BGRA:
                    dst[0] = blend_over_white(src[2], src[3]);
                    dst[1] = blend_over_white(src[1], src[3]);
                    dst[2] = blend_over_white(src[0], src[3]);
                    break;
                default:  // FPDFBitmap_BGRA_Premul; anything else was rejected above.
                    dst[0] = blend_premul_over_white(src[2], src[3]);
                    dst[1] = blend_premul_over_white(src[1], src[3]);
                    dst[2] = blend_premul_over_white(src[0], src[3]);
                    break;
            }
        }
    }
    return true;
}

/// Returns a human-readable name for a PDFium bitmap format constant.
std::string format_to_string(int format) {
    switch (format) {
        case FPDFBitmap_Unknown: return "Unknown";
        case FPDFBitmap_Gray: return "Gray";
        case FPDFBitmap_BGR: return "BGR";
        case FPDFBitmap_BGRx: return "BGRx";
        case FPDFBitmap_BGRA: return "BGRA";
        case FPDFBitmap_BGRA_Premul: return "BGRA_Premul";
        default: return "Unknown(" + std::to_string(format) + ")";
    }
}

/// Renders one image page object and writes it as a JPEG file.
///
/// The counter is advanced once a bitmap has been rendered and converted,
/// whether or not the file write succeeds, so file names never repeat.
static void ExportImageObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                              const std::string& output_dir, int page_index,
                              int& image_counter) {
    unsigned int logical_w = 0;
    unsigned int logical_h = 0;
    // Used for logging only; on failure the values stay 0.
    FPDFImageObj_GetImagePixelSize(obj, &logical_w, &logical_h);

    const ScopedBitmap bitmap(FPDFImageObj_GetRenderedBitmap(doc, page, obj));
    if (!bitmap) {
        return;
    }

    const int bmp_w = FPDFBitmap_GetWidth(bitmap.get());
    const int bmp_h = FPDFBitmap_GetHeight(bitmap.get());
    const int bmp_format = FPDFBitmap_GetFormat(bitmap.get());

    std::cout << "🖼️ Image " << image_counter
              << ": logical(" << logical_w << "x" << logical_h
              << "), bitmap(" << bmp_w << "x" << bmp_h
              << "), format=" << format_to_string(bmp_format) << std::endl;

    if (bmp_w <= 0 || bmp_h <= 0) {
        std::cerr << "⚠️ Invalid bitmap size, skipping.\n";
        return;
    }

    std::vector<unsigned char> rgb_data;
    if (!convert_to_rgb(bitmap.get(), rgb_data)) {
        return;
    }

    // Output path: <dir>/page_NN_img_NNN.jpg
    std::ostringstream oss;
    oss << output_dir << "/page_" << std::setw(2) << std::setfill('0') << page_index
        << "_img_" << std::setw(3) << std::setfill('0') << image_counter << ".jpg";
    const std::string output_path = oss.str();

    if (stbi_write_jpg(output_path.c_str(), bmp_w, bmp_h, 3, rgb_data.data(), 90)) {
        std::cout << "✅ Saved: " << output_path << std::endl;
    } else {
        std::cerr << "❌ Failed to write JPEG: " << output_path << std::endl;
    }
    image_counter++;
}

/// Exports the image held by a page object, descending into Form objects.
///
/// @param doc           Document that owns the page.
/// @param page          Page the object belongs to.
/// @param obj           Page object to inspect; a null handle is ignored.
/// @param output_dir    Directory that receives the image files.
/// @param page_index    Zero-based page index, used in the file name.
/// @param image_counter Running image number shared across all pages.
void ExtractImagesFromObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                             const std::string& output_dir, int page_index,
                             int& image_counter) {
    if (!obj) {
        return;
    }

    const int type = FPDFPageObj_GetType(obj);
    if (type == FPDF_PAGEOBJ_IMAGE) {
        ExportImageObject(doc, page, obj, output_dir, page_index, image_counter);
    } else if (type == FPDF_PAGEOBJ_FORM) {
        // A Form object is a container: visit each of its child objects.
        const int sub_count = FPDFFormObj_CountObjects(obj);
        for (int k = 0; k < sub_count; ++k) {
            FPDF_PAGEOBJECT sub_obj = FPDFFormObj_GetObject(obj, k);
            ExtractImagesFromObject(doc, page, sub_obj, output_dir, page_index, image_counter);
        }
    }
}

/// Loads the hard-coded PDF and exports every image on every page as JPEG.
int main() {
    const std::string input_pdf = "D:/BugPdf/bug.pdf";
    const std::string output_dir = "D:/image_out";

    // stbi_write_jpg cannot create missing directories, so make sure the
    // target exists first. An already existing directory is not an error.
    std::error_code dir_error;
    std::filesystem::create_directories(output_dir, dir_error);
    if (dir_error) {
        std::cerr << "❌ Failed to create directory: " << output_dir
                  << " - " << dir_error.message() << std::endl;
        return -1;
    }

    FPDF_InitLibrary();

    FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
    if (!document) {
        unsigned long error = FPDF_GetLastError();
        std::cerr << "❌ Failed to load PDF document: " << input_pdf
                  << " (Error: " << error << ")" << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    const int page_count = FPDF_GetPageCount(document);
    std::cout << "📄 Document has " << page_count << " pages." << std::endl;

    int total_images_exported = 0;
    for (int i = 0; i < page_count; i++) {
        FPDF_PAGE page = FPDF_LoadPage(document, i);
        if (!page) {
            continue;
        }

        const int obj_count = FPDFPage_CountObjects(page);
        std::cout << "📄 Page " << i << " has " << obj_count << " objects." << std::endl;

        for (int j = 0; j < obj_count; j++) {
            FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
            ExtractImagesFromObject(document, page, obj, output_dir, i, total_images_exported);
        }
        FPDF_ClosePage(page);
    }

    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();

    std::cout << "\n🎉 Done! Exported " << total_images_exported << " images.\n";
    return 0;
}
✅ 新功能说明
| 功能 | 说明 |
|---|---|
| 🧩 支持递归 Form 对象 | 自动深入 Form XObject(表单对象)中的嵌套图像 |
| 🖼️ 图像计数全局递增 | 输出的文件名连续编号,不会互相覆盖 |
| 💪 完全兼容旧 PDFium 接口 | 不需要修改库,只调用公开 API |
| 🚫 保留防崩溃保护 | 位图尺寸异常时仍会安全跳过 |
自动检测透明度的版本
完美 👍 那我来给你一个最终升级版:
✅ 支持递归提取 Form XObject(表单对象)中的图像
✅ 自动选择输出格式:带透明通道 → PNG,不透明 → JPEG
✅ 保留安全检查与日志输出
✅ 命名连续编号,防止覆盖
💎 最终版本:自动透明检测输出(PNG/JPEG)
#include <cstddef>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>

#include "fpdfview.h"
#include "fpdf_edit.h"
#include "fpdf_save.h"

// ---------------- STB Image Write ----------------
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

namespace {

/// Deleter that releases a bitmap handle through FPDFBitmap_Destroy.
struct BitmapDeleter {
    void operator()(FPDF_BITMAP bitmap) const { FPDFBitmap_Destroy(bitmap); }
};

/// Owning handle for an FPDF_BITMAP; the bitmap is destroyed on every exit path.
using ScopedBitmap = std::unique_ptr<std::remove_pointer<FPDF_BITMAP>::type, BitmapDeleter>;

/// Converts one premultiplied channel value back to straight alpha.
unsigned char unpremultiply(unsigned char value, unsigned char alpha) {
    if (alpha == 0) {
        return 0;  // Fully transparent: the colour is undefined, use 0.
    }
    const unsigned int straight = (value * 255u + alpha / 2u) / alpha;
    return static_cast<unsigned char>(straight > 255u ? 255u : straight);
}

/// Reports whether any pixel of a 4-byte-per-pixel buffer has alpha below 255.
bool has_transparent_pixel(const unsigned char* buffer, std::size_t width,
                           std::size_t height, std::size_t row_bytes) {
    for (std::size_t y = 0; y < height; ++y) {
        const unsigned char* row = buffer + y * row_bytes;
        for (std::size_t x = 0; x < width; ++x) {
            if (row[x * 4 + 3] != 255) {
                return true;
            }
        }
    }
    return false;
}

}  // namespace

/// Ensures that a directory (including missing parents) exists.
///
/// @param path Directory to create.
/// @return true if the directory exists when the call returns, including the
///         case where it already existed; false (after logging) otherwise.
bool create_directory(const std::string& path) {
    try {
        const std::filesystem::path dir(path);
        // create_directories returns false for an already existing directory,
        // which is not a failure, so its return value is not used as the result.
        std::filesystem::create_directories(dir);
        if (!std::filesystem::is_directory(dir)) {
            std::cerr << "❌ Failed to create directory: " << path
                      << " - path exists but is not a directory" << std::endl;
            return false;
        }
        return true;
    } catch (const std::exception& e) {
        std::cerr << "❌ Failed to create directory: " << path
                  << " - " << e.what() << std::endl;
        return false;
    }
}

/// Converts a PDFium bitmap into tightly packed 8-bit RGB or RGBA.
///
/// For the alpha-carrying formats (BGRA, BGRA_Premul) the alpha channel is
/// scanned: RGBA is produced only when at least one pixel is not fully
/// opaque, otherwise the result is plain RGB. Premultiplied input is
/// converted to straight alpha for RGBA output.
///
/// @param bitmap    Bitmap handle to read; must be a valid handle.
/// @param out_data  Receives width * height * (has_alpha ? 4 : 3) bytes.
/// @param has_alpha Set to true when out_data holds RGBA, false for RGB.
/// @return true on success; false if the handle, size, stride or buffer is
///         invalid, or if the pixel format is not supported.
bool convert_to_rgb(FPDF_BITMAP bitmap, std::vector<unsigned char>& out_data, bool& has_alpha) {
    has_alpha = false;
    if (!bitmap) {
        return false;
    }

    const int format = FPDFBitmap_GetFormat(bitmap);
    const int width = FPDFBitmap_GetWidth(bitmap);
    const int height = FPDFBitmap_GetHeight(bitmap);
    const int stride = FPDFBitmap_GetStride(bitmap);
    const auto* buffer = static_cast<const unsigned char*>(FPDFBitmap_GetBuffer(bitmap));

    std::size_t src_bpp = 0;        // bytes per source pixel
    bool format_has_alpha = false;  // the format carries an alpha byte
    switch (format) {
        case FPDFBitmap_Gray:
            src_bpp = 1;
            break;
        case FPDFBitmap_BGR:
            src_bpp = 3;
            break;
        case FPDFBitmap_BGRx:
            src_bpp = 4;
            break;
        case FPDFBitmap_BGRA:
        case FPDFBitmap_BGRA_Premul:
            src_bpp = 4;
            format_has_alpha = true;
            break;
        default:
            std::cerr << "❌ Unsupported bitmap format: " << format << std::endl;
            return false;
    }

    if (width <= 0 || height <= 0 || stride <= 0 || buffer == nullptr) {
        std::cerr << "❌ Invalid bitmap geometry or buffer." << std::endl;
        return false;
    }

    // All index arithmetic is done in size_t: width * height * 4 overflows int
    // for large rendered images.
    const std::size_t w = static_cast<std::size_t>(width);
    const std::size_t h = static_cast<std::size_t>(height);
    const std::size_t row_bytes = static_cast<std::size_t>(stride);
    if (row_bytes < w * src_bpp || h > out_data.max_size() / 4 / w) {
        std::cerr << "❌ Invalid bitmap geometry or buffer." << std::endl;
        return false;
    }

    if (format_has_alpha) {
        has_alpha = has_transparent_pixel(buffer, w, h, row_bytes);
    }
    const std::size_t dst_bpp = has_alpha ? 4 : 3;
    const bool premultiplied = (format == FPDFBitmap_BGRA_Premul);

    out_data.resize(w * h * dst_bpp);
    unsigned char* dst = out_data.data();

    for (std::size_t y = 0; y < h; ++y) {
        const unsigned char* src = buffer + y * row_bytes;
        for (std::size_t x = 0; x < w; ++x, src += src_bpp, dst += dst_bpp) {
            if (format == FPDFBitmap_Gray) {
                dst[0] = src[0];
                dst[1] = src[0];
                dst[2] = src[0];
                continue;
            }

            // Source byte order is B, G, R[, A or unused].
            unsigned char red = src[2];
            unsigned char green = src[1];
            unsigned char blue = src[0];
            if (has_alpha) {
                const unsigned char alpha = src[3];
                if (premultiplied) {
                    // PNG stores straight alpha, so undo the premultiplication.
                    red = unpremultiply(red, alpha);
                    green = unpremultiply(green, alpha);
                    blue = unpremultiply(blue, alpha);
                }
                dst[3] = alpha;
            }
            // When has_alpha is false every alpha byte is 255, so premultiplied
            // and straight colour values are identical and need no conversion.
            dst[0] = red;
            dst[1] = green;
            dst[2] = blue;
        }
    }
    return true;
}

/// Returns a human-readable name for a PDFium bitmap format constant.
std::string format_to_string(int format) {
    switch (format) {
        case FPDFBitmap_Unknown: return "Unknown";
        case FPDFBitmap_Gray: return "Gray";
        case FPDFBitmap_BGR: return "BGR";
        case FPDFBitmap_BGRx: return "BGRx";
        case FPDFBitmap_BGRA: return "BGRA";
        case FPDFBitmap_BGRA_Premul: return "BGRA_Premul";
        default: return "Unknown(" + std::to_string(format) + ")";
    }
}

/// Renders one image page object and writes it as PNG (with transparency) or JPEG.
///
/// The counter is advanced once a bitmap has been rendered and converted,
/// whether or not the file write succeeds, so file names never repeat.
static void ExportImageObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                              const std::string& output_dir, int page_index,
                              int& image_counter) {
    unsigned int logical_w = 0;
    unsigned int logical_h = 0;
    // Used for logging only; on failure the values stay 0.
    FPDFImageObj_GetImagePixelSize(obj, &logical_w, &logical_h);

    const ScopedBitmap bitmap(FPDFImageObj_GetRenderedBitmap(doc, page, obj));
    if (!bitmap) {
        return;
    }

    const int bmp_w = FPDFBitmap_GetWidth(bitmap.get());
    const int bmp_h = FPDFBitmap_GetHeight(bitmap.get());
    const int bmp_format = FPDFBitmap_GetFormat(bitmap.get());

    std::cout << "🖼️ Image " << image_counter
              << ": logical(" << logical_w << "x" << logical_h
              << "), bitmap(" << bmp_w << "x" << bmp_h
              << "), format=" << format_to_string(bmp_format) << std::endl;

    if (bmp_w <= 0 || bmp_h <= 0) {
        std::cerr << "⚠️ Invalid bitmap size, skipping.\n";
        return;
    }

    std::vector<unsigned char> img_data;
    bool has_alpha = false;
    if (!convert_to_rgb(bitmap.get(), img_data, has_alpha)) {
        return;
    }

    // Output path: <dir>/page_NN_img_NNN.(png|jpg)
    std::ostringstream oss;
    oss << output_dir << "/page_" << std::setw(2) << std::setfill('0') << page_index
        << "_img_" << std::setw(3) << std::setfill('0') << image_counter
        << (has_alpha ? ".png" : ".jpg");
    const std::string output_path = oss.str();

    bool success = false;
    if (has_alpha) {
        success = stbi_write_png(output_path.c_str(), bmp_w, bmp_h, 4,
                                 img_data.data(), bmp_w * 4) != 0;
    } else {
        success = stbi_write_jpg(output_path.c_str(), bmp_w, bmp_h, 3,
                                 img_data.data(), 90) != 0;
    }

    if (success) {
        std::cout << "✅ Saved: " << output_path << std::endl;
    } else {
        std::cerr << "❌ Failed to write: " << output_path << std::endl;
    }
    image_counter++;
}

/// Exports the image held by a page object, descending into Form objects.
///
/// @param doc           Document that owns the page.
/// @param page          Page the object belongs to.
/// @param obj           Page object to inspect; a null handle is ignored.
/// @param output_dir    Directory that receives the image files.
/// @param page_index    Zero-based page index, used in the file name.
/// @param image_counter Running image number shared across all pages.
void ExtractImagesFromObject(FPDF_DOCUMENT doc, FPDF_PAGE page, FPDF_PAGEOBJECT obj,
                             const std::string& output_dir, int page_index,
                             int& image_counter) {
    if (!obj) {
        return;
    }

    const int type = FPDFPageObj_GetType(obj);
    if (type == FPDF_PAGEOBJ_IMAGE) {
        ExportImageObject(doc, page, obj, output_dir, page_index, image_counter);
    } else if (type == FPDF_PAGEOBJ_FORM) {
        // A Form object is a container: visit each of its child objects.
        const int sub_count = FPDFFormObj_CountObjects(obj);
        for (int k = 0; k < sub_count; ++k) {
            FPDF_PAGEOBJECT sub_obj = FPDFFormObj_GetObject(obj, k);
            ExtractImagesFromObject(doc, page, sub_obj, output_dir, page_index, image_counter);
        }
    }
}

/// Loads the hard-coded PDF and exports every image on every page.
int main() {
    const std::string input_pdf = "D:/BugPdf/bug.pdf";
    const std::string output_dir = "D:/image_out";

    // Without the output directory every write would fail, so stop early;
    // create_directory has already logged the reason.
    if (!create_directory(output_dir)) {
        return -1;
    }

    FPDF_InitLibrary();

    FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
    if (!document) {
        unsigned long error = FPDF_GetLastError();
        std::cerr << "❌ Failed to load PDF document: " << input_pdf
                  << " (Error: " << error << ")" << std::endl;
        FPDF_DestroyLibrary();
        return -1;
    }

    const int page_count = FPDF_GetPageCount(document);
    std::cout << "📄 Document has " << page_count << " pages.\n";

    int total_images = 0;
    for (int i = 0; i < page_count; i++) {
        FPDF_PAGE page = FPDF_LoadPage(document, i);
        if (!page) {
            continue;
        }

        const int obj_count = FPDFPage_CountObjects(page);
        std::cout << "\n📄 Page " << i << " has " << obj_count << " objects.\n";

        for (int j = 0; j < obj_count; j++) {
            FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
            ExtractImagesFromObject(document, page, obj, output_dir, i, total_images);
        }
        FPDF_ClosePage(page);
    }

    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();

    std::cout << "\n🎉 Done! Exported " << total_images << " images.\n";
    return 0;
}
✨ 改进要点
| 功能 | 说明 |
|---|---|
| 🧩 递归 Form 支持 | 可提取 Form XObject(表单对象)内的嵌套图片 |
| 🧠 透明通道检测 | 含透明通道的位图(BGRA / BGRA_Premul)→ PNG |
| 🪶 普通图片 → JPEG | 自动区分输出格式 |
| 🧱 安全检查 | 所有宽高、缓冲区安全验证保留 |
| 💾 自动创建输出目录 | 避免因路径不存在而报错 |
| 📸 连续命名 | 采用 page_XX_img_XXX 格式,防止覆盖 |