You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

68 lines
2.1KB

  1. using System.Text;
  2. using System.Text.RegularExpressions;
  3. using UglyToad.PdfPig;
  4. namespace ToolsServices
  5. {
  6. public static class PdfService
  7. {
  8. #region Méthodes publiques
  9. public static string ExtractPdfFromBytes(byte[] bytes)
  10. {
  11. using var ms = new MemoryStream(bytes);
  12. using var document = UglyToad.PdfPig.PdfDocument.Open(ms);
  13. var builder = new StringBuilder();
  14. foreach (var page in document.GetPages())
  15. {
  16. builder.AppendLine(page.Text);
  17. }
  18. return builder.ToString();
  19. }
  20. public static string ExtractTextFromPdf(string fileFullname)
  21. {
  22. LoggerService.LogInfo($"PdfService.ExtractTextFromPdf : {fileFullname}");
  23. var text = new StringBuilder();
  24. using (var document = PdfDocument.Open(fileFullname))
  25. {
  26. foreach (var page in document.GetPages())
  27. {
  28. string texte = NettoyerTexte(page.Text);
  29. text.AppendLine(texte);
  30. }
  31. }
  32. return text.ToString();
  33. }
  34. #endregion
  35. public static string NettoyerTexte(string input)
  36. {
  37. if (string.IsNullOrEmpty(input))
  38. return input;
  39. // 1. Supprimer les caractères de contrôle ASCII < 32 (sauf retour à la ligne)
  40. string cleaned = new string(input.Where(c => !char.IsControl(c) || c == '\n' || c == '\r').ToArray());
  41. // 2. Normaliser les espaces
  42. cleaned = Regex.Replace(cleaned, @"\s+", " ");
  43. // 3. Supprimer les coupures de mots (tiret suivi d’espace ou retour ligne)
  44. cleaned = Regex.Replace(cleaned, @"-\s+", "");
  45. // 4. Normaliser Unicode pour remettre les accents
  46. cleaned = cleaned.Normalize(NormalizationForm.FormC);
  47. // 5. Supprimer les caractères non imprimables restants
  48. cleaned = new string(cleaned.Where(c => c >= 32).ToArray());
  49. return cleaned.Trim();
  50. }
  51. }
  52. }