diff --git a/CHANGELOG.md b/CHANGELOG.md index 08b5f89..5234770 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.12.0] - 2025-01-09 +### Added +* Added document minification as a feature before document translation, to + allow translation of large docx or pptx files. For more info check the README. ## [1.11.0] - 2024-11-15 ### Added @@ -169,7 +173,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.1.0] - 2021-11-05 Initial release. - +[1.12.0]: https://github.com/DeepLcom/deepl-dotnet/compare/v1.11.0...v1.12.0 [1.11.0]: https://github.com/DeepLcom/deepl-dotnet/compare/v1.10.0...v1.11.0 [1.10.0]: https://github.com/DeepLcom/deepl-dotnet/compare/v1.9.0...v1.10.0 [1.9.0]: https://github.com/DeepLcom/deepl-dotnet/compare/v1.8.0...v1.9.0 diff --git a/DeepL/DeepL.csproj b/DeepL/DeepL.csproj index 14534dc..a4d2d3a 100644 --- a/DeepL/DeepL.csproj +++ b/DeepL/DeepL.csproj @@ -3,9 +3,9 @@ DeepL.net is the official DeepL .NET client library. DeepL.net - 1.11.0 - 1.11.0 - 1.11.0.0 + 1.12.0 + 1.12.0 + 1.12.0.0 1.0.0.0 net5.0;netstandard2.0 8 @@ -32,14 +32,14 @@ - - + + - - + + diff --git a/DeepL/DeepLException.cs b/DeepL/DeepLException.cs index 760334f..8c185e1 100644 --- a/DeepL/DeepLException.cs +++ b/DeepL/DeepLException.cs @@ -103,4 +103,30 @@ public DocumentTranslationException(string message, Exception innerException, Do /// The handle can be used to later retrieve the document or to contact DeepL support. public DocumentHandle? DocumentHandle { get; } } + + /// + /// Exception thrown if an error occurs during the minification phase of document minification. 
+ /// See <see cref="DocumentMinifier"/> + /// + public sealed class DocumentMinificationException : DeepLException { + /// <summary>Initializes a new instance of the <see cref="DocumentMinificationException"/> class.</summary> + /// <param name="message">The message that describes the error.</param> + /// <param name="innerException">The exception that caused the minification to fail.</param> + public DocumentMinificationException(string message, Exception innerException) : + base(message, innerException) { + } + } + + /// <summary> + /// Exception thrown if an error occurs during the deminification phase of document minification. + /// See <see cref="DocumentMinifier"/> + /// </summary> + public sealed class DocumentDeminificationException : DeepLException { + /// <summary>Initializes a new instance of the <see cref="DocumentDeminificationException"/> class.</summary> + /// <param name="message">The message that describes the error.</param> + /// <param name="innerException">The exception that caused the deminification to fail.</param> + public DocumentDeminificationException(string message, Exception innerException) : + base(message, innerException) { + } + } } diff --git a/DeepL/DocumentMinifier.cs b/DeepL/DocumentMinifier.cs new file mode 100644 index 0000000..f9eff14 --- /dev/null +++ b/DeepL/DocumentMinifier.cs @@ -0,0 +1,388 @@ +// Copyright 2022 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. +using System; +using System.IO; +using System.IO.Compression; +using System.Linq; +using System.Threading; + +namespace DeepL { + public interface IDocumentMinifier { + /// <summary> + /// Minifies a given document using the given tempDir, by extracting it as a ZIP file and + /// replacing all supported media files with a small placeholder. + /// Created file will be inside the tempDir, the filename can be retrieved by calling + /// <see cref="DocumentMinifier.GetMinifiedDocFile"/> with the input file path as a parameter + /// Note that this method will minify the file without any checks, you should first call + /// <see cref="DocumentMinifier.CanMinifyFile"/> on the input file. + /// If cleanup is set to <c>true</c>, the extracted document will be deleted afterwards, and only + /// the original media and the minified file will remain in the tempDir. + /// </summary> + /// <param name="inputFilePath">Path to the file to be minified.</param> 
+ /// <param name="cleanup"> + /// If true, will delete the extracted document files from the temporary directory. + /// Otherwise, the files will remain (useful for debugging). + /// </param> + /// <returns> + /// The path of the minified document. Can also be retrieved by calling + /// <see cref="DocumentMinifier.GetMinifiedDocFile"/> + /// </returns> + /// <exception cref="DocumentMinificationException"> + /// If an exception occurred during the minification process + /// </exception> + public string MinifyDocument(string inputFilePath, bool cleanup = false); + + /// <summary> + /// Deminifies a given file at inputFilePath by reinserting its original media in tempDir and stores + /// the resulting document in outputFilePath. If cleanup is set to true, it will delete the + /// tempDir afterwards, otherwise nothing will happen after the deminification. + /// </summary> + /// <param name="inputFilePath">Path to document to be deminified with its media.</param> + /// <param name="outputFilePath">Where the final (deminified) document will be stored.</param> + /// <param name="cleanup">Determines if the tempDir is deleted at the end of this method.</param> + /// <exception cref="DocumentDeminificationException"> + /// If an exception occurred during the deminification process + /// </exception> + public void DeminifyDocument(string inputFilePath, string outputFilePath, bool cleanup = false); + } + + /// + /// Class that implements document minification: Stripping supported files like pptx and docx + /// of their media (images, videos, etc) before uploading them to the DeepL API to be translated. + /// This allows users to translate files that would usually hit the size limit for files. + /// Please note the following: + /// + /// + /// + /// To use this class, you first need to check by calling + /// <see cref="DocumentMinifier.CanMinifyFile"/> if the file type is supported. This class performs no further checks. + /// + /// + /// + /// + /// The DocumentMinifier is stateful, so you cannot use it to minify multiple documents at once. + /// You need to create a new DocumentMinifier object per document. + /// + /// + /// + /// + /// Be very careful when providing a custom tempDir when instantiating the class. For example, + /// <see cref="DocumentMinifier.DeminifyDocument"/> will delete the entire tempDir with + /// cleanup set to true (disabled by default). 
In order not to lose any data, ideally always + /// call new DocumentMinifier() in order to get a fresh temporary directory. + /// + /// + /// + /// + /// If an error occurs during minification, either a <see cref="DocumentMinificationException"/> or a + /// <see cref="DocumentDeminificationException"/> will be thrown, depending on which phase the error + /// occurred in. + /// + /// + /// + /// The document minification process works in 2 phases: + /// + /// + /// + /// Minification: The document is extracted into a temporary directory, the media files are backed up, + /// the media in the document is replaced with placeholders and a minified document is created. + /// + /// + /// + /// + /// Deminification: The minified document is extracted into a temporary directory, the media backups are + /// reinserted into the extracted document, and the document is deminified into the output path. + /// + /// + /// + /// If cleanup is enabled, the minification phase will delete the folder with the extracted document + /// and the deminification phase will delete the entire temporary directory. + /// Note that by default, the input file will be kept on disk, and as such no further backups of media etc. + /// are made (as they are all available from the input file). + /// Example usage: + /// + /// var inputFile = "/home/exampleUser/document.pptx"; + /// var outputFile = "/home/exampleUser/document_ES.pptx"; + /// var minifier = new DocumentMinifier(); + /// if (DocumentMinifier.CanMinifyFile(inputFile)) { + /// try { + /// minifier.MinifyDocument(inputFile, true); + /// var minifiedFile = minifier.GetMinifiedDocFile(inputFile); + /// // process file minifiedFile, e.g. translate it with DeepL + /// minifier.DeminifyDocument(minifiedFile, outputFile, true); + /// // process file outputFile + /// } catch (DocumentMinificationException e) { + /// // handle exception during minification, e.g. print list of media, clean up temporary directory, etc + /// } catch (DocumentDeminificationException e) { + /// // handle exception during deminification, e.g. 
save minified document, clean up temporary directory, etc
+  ///         } catch (DocumentTranslationException e) {
+  ///             // handle general DocTrans exception (mostly useful if document is translated between minification
+  ///             // and deminification)
+  ///         }
+  ///     }
+  ///
+  ///
+  public class DocumentMinifier : IDocumentMinifier {
+    /// <summary>Which input document types are supported for minification.</summary>
+    private static readonly string[] SupportedDocumentTypes = { ".pptx", ".docx" };
+
+    /// <summary>Which media formats in the documents are supported for minification.</summary>
+    private static readonly string[] SupportedMediaFormats = {
+      // Image formats
+      ".png", ".jpg", ".jpeg", ".emf", ".bmp", ".tiff", ".wdp", ".svg", ".gif",
+      // Video formats
+      // Taken from https://support.microsoft.com/en-gb/office/video-and-audio-file-formats-supported-in-powerpoint-d8b12450-26db-4c7b-a5c1-593d3418fb59
+      ".mp4", ".asf", ".avi", ".m4v", ".mpg", ".mpeg", ".wmv", ".mov",
+      // Audio formats, taken from the same URL as video
+      ".aiff", ".au", ".mid", ".midi", ".mp3", ".m4a", ".wav", ".wma"
+    };
+
+    private const string ExtractedDocDirName = "extracted_doc";
+    private const string OriginalMediaDirName = "original_media";
+    private const string MinifiedDocFileBaseName = "minifiedDoc";
+    private const int MinifiedDocSizeLimitWarning = 5000000;
+
+    private readonly string _tempDir;
+
+    /// <summary>
+    /// Initializes a new <see cref="DocumentMinifier"/> object either with a specified or newly created
+    /// temporary directory.
+    /// </summary>
+    /// <param name="tempDir">The temporary directory used for media extraction during minification</param>
+    public DocumentMinifier(string? tempDir = null) {
+      _tempDir = tempDir ?? CreateTemporaryDirectory();
+    }
+
+    /// <summary>Checks if a given file can be minified or not</summary>
+    /// <param name="inputFilePath">The path to the file</param>
+    /// <returns>true if the file can be minified otherwise false</returns>
+    /// <exception cref="ArgumentException">
+    /// if the inputFilePath contains characters not allowed in a path name
+    /// </exception>
+    public static bool CanMinifyFile(string inputFilePath) {
+      return !string.IsNullOrWhiteSpace(inputFilePath) &&
+             SupportedDocumentTypes.Contains(Path.GetExtension(inputFilePath).ToLowerInvariant());
+    }
+
+    /// <summary>Gets the path for where the minified version of the input file will live</summary>
+    /// <param name="inputFilePath">The path to the file</param>
+    /// <returns>The path to the minified version of the file</returns>
+    /// <remarks>The extension of inputFilePath is kept, so the minified copy has the same document type.</remarks>
+    /// <exception cref="ArgumentException">
+    /// if the inputFilePath contains characters not allowed in a path name
+    /// </exception>
+    public string GetMinifiedDocFile(string inputFilePath) {
+      var minifiedDocFileName = Path.ChangeExtension(MinifiedDocFileBaseName, Path.GetExtension(inputFilePath));
+      return Path.Combine(_tempDir, minifiedDocFileName);
+    }
+
+    /// <summary>Gets the path to the directory where the input file will be extracted to</summary>
+    /// <returns>The path to the directory where the input file will be extracted to</returns>
+    public string GetExtractedDocDirectory() {
+      return Path.Combine(_tempDir, ExtractedDocDirName);
+    }
+
+    /// <summary>Gets the path to the directory where the original media was extracted to</summary>
+    /// <returns>The path to the media directory containing the original media</returns>
+    public string GetOriginalMediaDirectory() {
+      return Path.Combine(_tempDir, OriginalMediaDirName);
+    }
+
+    /// <inheritdoc />
+    public string MinifyDocument(string inputFilePath, bool cleanup = false) {
+      var extractedDocDirectory = GetExtractedDocDirectory();
+      var mediaDir = GetOriginalMediaDirectory();
+      var minifiedDocFilePath = GetMinifiedDocFile(inputFilePath);
+
+      try {
+        ExtractZipTo(inputFilePath, extractedDocDirectory);
+      } catch (Exception ex) {
+        throw new DocumentMinificationException(
+              $"Exception when extracting document: Failed to extract {inputFilePath} to {extractedDocDirectory}",
+              ex);
+      }
+
+      ExportMediaToMediaDirAndReplace(extractedDocDirectory, mediaDir);
+
+      try {
+        ZipFile.CreateFromDirectory(extractedDocDirectory, minifiedDocFilePath);
+      } catch (Exception ex) {
+        throw new DocumentMinificationException($"Failed creating a zip file at {minifiedDocFilePath}", ex);
+      }
+
+      if (cleanup) {
+        try {
+          Directory.Delete(extractedDocDirectory, true);
+        } catch (Exception ex) {
+          throw new DocumentMinificationException($"Failed to delete directory {extractedDocDirectory}", ex);
+        }
+      }
+
+      var fileSizeResponse = new FileInfo(minifiedDocFilePath).Length;
+      if (fileSizeResponse > MinifiedDocSizeLimitWarning) {
+        Console.Error.WriteLine(
+              "The input file could not be minified below 5 MB, likely a media type is missing. " +
+              "This might cause the translation to fail.");
+      }
+
+      return minifiedDocFilePath;
+    }
+
+    /// <inheritdoc />
+    public void DeminifyDocument(string inputFilePath, string outputFilePath, bool cleanup = false) {
+      var extractedDocDirectory = GetExtractedDocDirectory();
+      var mediaDir = GetOriginalMediaDirectory();
+      if (!Directory.Exists(extractedDocDirectory)) {
+        try {
+          Directory.CreateDirectory(extractedDocDirectory);
+        } catch (Exception ex) {
+          throw new DocumentDeminificationException(
+                $"Exception when deminifying, could not create directory at {extractedDocDirectory}.",
+                ex);
+        }
+      }
+
+      try {
+        ExtractZipTo(inputFilePath, extractedDocDirectory);
+      } catch (Exception ex) {
+        throw new DocumentDeminificationException(
+              $"Exception when extracting document: Failed to extract {inputFilePath} to {extractedDocDirectory}",
+              ex);
+      }
+
+      ReplaceMediaInDir(extractedDocDirectory, mediaDir);
+      try {
+        if (File.Exists(outputFilePath)) {
+          File.Delete(outputFilePath);
+        }
+
+        ZipFile.CreateFromDirectory(extractedDocDirectory, outputFilePath);
+      } catch (Exception ex) {
+        throw new DocumentDeminificationException($"Failed creating a zip file at {outputFilePath}", ex);
+      }
+
+      if (cleanup) {
+        try {
+          Directory.Delete(_tempDir, true);
+        } catch (Exception ex) {
+          throw new DocumentDeminificationException($"Failed to delete directory {_tempDir}", ex);
+        }
+      }
+    }
+
+    /// <summary>
+    /// Creates a temporary directory for use in the <see cref="DocumentMinifier"/>.
+    /// Uses the system's temporary directory.
+    /// </summary>
+    /// <returns>The path of the created temporary directory</returns>
+    /// <exception cref="DocumentMinificationException">if the temporary directory could not be created</exception>
+    private static string CreateTemporaryDirectory() {
+      var tempDir = Path.Combine(Path.GetTempPath(), "document_minification_" + Guid.NewGuid().ToString("N"));
+      while (Directory.Exists(tempDir)) {
+        Thread.Sleep(1);
+        tempDir = Path.Combine(Path.GetTempPath(), "document_minification_" + Guid.NewGuid().ToString("N"));
+      }
+
+      try {
+        Directory.CreateDirectory(tempDir);
+      } catch (Exception ex) {
+        throw new DocumentMinificationException($"Failed creating temporary directory at {tempDir}", ex);
+      }
+
+      return tempDir;
+    }
+
+    /// <summary>Extracts a zip file to a given directory</summary>
+    /// <param name="zippedDocumentPath">The path to the zip file</param>
+    /// <param name="extractionDir">
+    /// The path to the directory where the contents of the zip file will be extracted to
+    /// </param>
+    private void ExtractZipTo(string zippedDocumentPath, string extractionDir) {
+      if (!Directory.Exists(extractionDir)) {
+        Directory.CreateDirectory(extractionDir);
+      }
+
+      ZipFile.ExtractToDirectory(zippedDocumentPath, extractionDir);
+    }
+
+    ///
+    /// Iterates through the inputDirectory and if it contains a supported media file, will export that media
+    /// to the mediaDirectory and replace the media in the inputDirectory with a placeholder. The
+    /// relative path will be preserved when moving the file to the mediaDirectory (e.g. 
a file located at
+    /// "/inputDirectory/foo/bar.png" will be exported to "/mediaDirectory/foo/bar.png")
+    ///
+    /// <param name="inputDirectory">The path to the input directory</param>
+    /// <param name="mediaDirectory">
+    /// The path to the directory where the supported media from inputDirectory will be exported to
+    /// </param>
+    /// <exception cref="DocumentMinificationException">
+    /// If a problem occurred when exporting the original media from inputDirectory to mediaDirectory
+    /// </exception>
+    private void ExportMediaToMediaDirAndReplace(string inputDirectory, string mediaDirectory) {
+      foreach (var sourcePath in Directory.GetFiles(inputDirectory, "*.*", SearchOption.AllDirectories)) {
+        var extension = Path.GetExtension(sourcePath).ToLowerInvariant();
+        if (!SupportedMediaFormats.Contains(extension)) continue;
+
+        var relativePath = sourcePath.Substring(inputDirectory.Length + 1);
+        // backupDir should never be null as backupPath contains the specified mediaDirectory
+        var backupPath = Path.Combine(mediaDirectory, relativePath);
+        var backupDir = Path.GetDirectoryName(backupPath);
+
+        try {
+          if (!(string.IsNullOrWhiteSpace(backupDir) || Directory.Exists(backupDir))) {
+            Directory.CreateDirectory(backupDir);
+          }
+
+          File.Move(sourcePath, backupPath);
+          File.WriteAllText(sourcePath, "DeepL Media Placeholder");
+        } catch (Exception cause) {
+          throw new DocumentMinificationException($"Exception when exporting and replacing media files", cause);
+        }
+      }
+    }
+
+    /// <summary>
+    /// Iterates through mediaDirectory and moves all files into the inputDirectory while preserving
+    /// the relative paths. (e.g. /mediaDirectory/foo/bar.png will be moved to the path /inputDirectory/foo/bar.png
+    /// and replace any file if it exists at that path. Any subdirectories in mediaDirectory will also be
+    /// created in inputDirectory).
+    /// </summary>
+    /// <param name="inputDirectory">The path to the input directory</param>
+    ///
+    /// The path to the directory where the original media lives. This media will be reinserted back and replace any
+    /// placeholder media. 
+    ///
+    /// <exception cref="DocumentDeminificationException">
+    /// If a problem occurred when trying to reinsert the media
+    /// </exception>
+    private void ReplaceMediaInDir(string inputDirectory, string mediaDirectory) {
+      if (!Directory.Exists(mediaDirectory)) return; // no media was exported during minification, nothing to reinsert
+      foreach (var filePath in Directory.GetFiles(mediaDirectory, "*.*", SearchOption.AllDirectories)) {
+        var curMediaPath = Path.Combine(inputDirectory, filePath.Substring(mediaDirectory.Length + 1));
+        var curMediaDir = Path.GetDirectoryName(curMediaPath);
+        if (!string.IsNullOrWhiteSpace(curMediaDir) && !Directory.Exists(curMediaDir)) {
+          try {
+            Directory.CreateDirectory(curMediaDir);
+          } catch (Exception ex) {
+            throw new DocumentDeminificationException(
+                  $"Exception when reinserting media. Failed to create directory at {curMediaDir}.",
+                  ex);
+          }
+        }
+
+        try {
+          if (File.Exists(curMediaPath)) {
+            File.Delete(curMediaPath);
+          }
+
+          File.Move(filePath, curMediaPath);
+        } catch (Exception ex) {
+          throw new DocumentDeminificationException(
+                $"Exception when reinserting media. Failed to move media back to {curMediaPath}.",
+                ex);
+        }
+      }
+    }
+  }
+}
diff --git a/DeepL/DocumentTranslateOptions.cs b/DeepL/DocumentTranslateOptions.cs index d24101e..a2fe740 100644 --- a/DeepL/DocumentTranslateOptions.cs +++ b/DeepL/DocumentTranslateOptions.cs @@ -26,5 +26,8 @@ public DocumentTranslateOptions(GlossaryInfo glossary) : this() { /// Specifies the ID of a glossary to use with the translation. public string? GlossaryId { get; set; } + + /// Controls whether to use Document Minification for translation, if available. + public bool EnableDocumentMinification { get; set; } } } diff --git a/DeepL/Translator.cs b/DeepL/Translator.cs index 357575c..854ab67 100644 --- a/DeepL/Translator.cs +++ b/DeepL/Translator.cs @@ -546,12 +546,19 @@ public async Task TranslateDocumentAsync( string targetLanguageCode, DocumentTranslateOptions? 
options = null, CancellationToken cancellationToken = default) { - using var inputFile = inputFileInfo.OpenRead(); + var willMinify = (options?.EnableDocumentMinification ?? false) && DocumentMinifier.CanMinifyFile(inputFileInfo.Name); + var fileToUpload = inputFileInfo; + var minifier = new DocumentMinifier(); + if (willMinify) { + minifier.MinifyDocument(inputFileInfo.FullName, true); + fileToUpload = new FileInfo(minifier.GetMinifiedDocFile(inputFileInfo.FullName)); + } + using var inputFile = fileToUpload.OpenRead(); using var outputFile = outputFileInfo.Open(FileMode.CreateNew, FileAccess.Write); try { await TranslateDocumentAsync( inputFile, - inputFileInfo.Name, + fileToUpload.Name, outputFile, sourceLanguageCode, targetLanguageCode, @@ -565,6 +572,10 @@ await TranslateDocumentAsync( } throw; + } if (willMinify) { + outputFile.Dispose(); + // Translated minified file is at `outputFileName`. Reinsert media (deminify) before returning + minifier.DeminifyDocument(outputFileInfo.FullName, outputFileInfo.FullName, true); } } diff --git a/DeepLTests/BaseDeepLTest.cs b/DeepLTests/BaseDeepLTest.cs index 433eb36..c3f68fb 100644 --- a/DeepLTests/BaseDeepLTest.cs +++ b/DeepLTests/BaseDeepLTest.cs @@ -5,7 +5,11 @@ using System; using System.Collections.Generic; using System.IO; +using System.IO.Compression; +using System.Linq; +using System.Net; using System.Net.Http; +using System.Security.Cryptography; using System.Threading; using System.Threading.Tasks; using DeepL; @@ -17,6 +21,8 @@ public class BaseDeepLTest { protected static readonly string AuthKey; protected static readonly string? ServerUrl; protected static readonly string? 
ProxyUrl; + protected static readonly Dictionary DocMinificationTestFilesMapping; + private static Random _random = new Random(); static BaseDeepLTest() { if (IsMockServer) { @@ -28,7 +34,13 @@ static BaseDeepLTest() { "DEEPL_AUTH_KEY environment variable must be set unless using mock server."); ServerUrl = Environment.GetEnvironmentVariable("DEEPL_SERVER_URL"); } + ProxyUrl = Environment.GetEnvironmentVariable("DEEPL_PROXY_URL"); + DocMinificationTestFilesMapping = new Dictionary() { + { ".docx", "example_document_template.docx" }, + { ".pptx", "example_presentation_template.pptx" }, + { ".zip", "example_zip_template.zip" } + }; } protected static Translator CreateTestTranslator(bool randomAuthKey = false) { @@ -56,7 +68,7 @@ protected static Translator CreateTestTranslatorWithMockSession( } protected static MockHttpMessageHandler getMockHandler(String responseMessage) { - var response = new HttpResponseMessage(System.Net.HttpStatusCode.OK); + var response = new HttpResponseMessage(HttpStatusCode.OK); response.Content = new StringContent(responseMessage); return new MockHttpMessageHandler(response); } @@ -214,6 +226,47 @@ protected static string TempDir() { return path; } + protected static string GetFullPathForTestFile(string testFileName) { + return Path.Combine(Directory.GetCurrentDirectory(), "resources", testFileName); + } + + protected static string CreateMinifiedTestDocument(string extension, string outputDirectory) { + var extractionDir = TempDir(); + var testFilePath = GetFullPathForTestFile(DocMinificationTestFilesMapping[extension]); + var outputFilePath = Path.Combine(outputDirectory, "test_document" + extension); + ZipFile.ExtractToDirectory(testFilePath, extractionDir); + var characters = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ~!@#$%^&*()_+=-<,>.?:"; + var length = 90000000; + var createText = new string( + Enumerable.Repeat(characters, length) + .Select(s => s[_random.Next(s.Length)]) + .ToArray()); + 
File.WriteAllText(Path.Combine(extractionDir, "placeholder_image.png"), createText); + ZipFile.CreateFromDirectory(extractionDir, outputFilePath); + Directory.Delete(extractionDir, true); + return outputFilePath; + } + + protected bool AreDirectoriesEqual(string dir1, string dir2) { + var dir1Info = new DirectoryInfo(dir1); + var dir2Info = new DirectoryInfo(dir2); + + var dir1Files = dir1Info.GetFiles("*.*", SearchOption.AllDirectories); + var dir2Files = dir2Info.GetFiles("*.*", SearchOption.AllDirectories); + + var dir1Hashes = dir1Files.ToDictionary(k => k.Name, GetHashForFile); + var dir2Hashes = dir2Files.ToDictionary(k => k.Name, GetHashForFile); + + return dir1Hashes.Keys.Count == dir2Hashes.Keys.Count && + dir1Hashes.All(kvp => dir2Hashes.ContainsKey(kvp.Key) && dir2Hashes[kvp.Key].SequenceEqual(kvp.Value)); + } + + private byte[] GetHashForFile(FileInfo file) { + using var fileStream = file.OpenRead(); + using var md5 = MD5.Create(); + return md5.ComputeHash(fileStream); + } + protected struct SessionOptions { public int? NoResponse; public int? RespondWith429; @@ -250,17 +303,17 @@ public RealServerOnlyFact() { } } - /// /// Class to mock HTTP requests the library makes. Supports returning a constant response to every request /// through . /// If we ever need more complex mocking functionality, we should drop this and use a mocking library. /// - protected class MockHttpMessageHandler : System.Net.Http.HttpMessageHandler { + protected class MockHttpMessageHandler : HttpMessageHandler { /// /// List of requests made through this mock. Use to make assertions in your tests after the code has run. /// public List requests; + /// /// Default response returned on every HTTP request. 
If we need more complex functionality, /// we should use a proper mocking library, for example Moq @@ -271,7 +324,10 @@ public MockHttpMessageHandler(HttpResponseMessage response) : base() { defaultResponse = response; requests = new List(); } - protected override async Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) { + + protected override async Task SendAsync( + HttpRequestMessage request, + CancellationToken cancellationToken) { this.requests.Add(request); await Task.Delay(0); return defaultResponse; diff --git a/DeepLTests/DeepLTests.csproj b/DeepLTests/DeepLTests.csproj index aaeb57a..9586d00 100644 --- a/DeepLTests/DeepLTests.csproj +++ b/DeepLTests/DeepLTests.csproj @@ -24,4 +24,8 @@ + + PreserveNewest + + diff --git a/DeepLTests/DocumentMinificationTest.cs b/DeepLTests/DocumentMinificationTest.cs new file mode 100644 index 0000000..e942be1 --- /dev/null +++ b/DeepLTests/DocumentMinificationTest.cs @@ -0,0 +1,127 @@ +// Copyright 2022 DeepL SE (https://www.deepl.com) +// Use of this source code is governed by an MIT +// license that can be found in the LICENSE file. 
+ +using System.Collections.Generic; +using System.IO; +using System.IO.Compression; +using System.Threading.Tasks; +using DeepL; +using Xunit; + +namespace DeepLTests { + public sealed class DocumentMinificationTest : BaseDeepLTest { + private readonly string _tempDir = TempDir(); + + private string OutputDocumentPath(string extension) { + var path = Path.Combine(_tempDir, "output", Path.ChangeExtension("example_document", extension)); + Directory.CreateDirectory(Path.Combine(_tempDir, "output")); + File.Delete(path); + return path; + } + + [Theory] + [InlineData(".pptx")] + [InlineData(".docx")] + public void TestMinifyDocumentHappyPath(string extension) { + var minifiedTestDocument = CreateMinifiedTestDocument(extension, _tempDir); + var originalFileSize = new FileInfo(minifiedTestDocument).Length; + var minifier = new DocumentMinifier(_tempDir); + var minifiedDocumentPath = minifier.MinifyDocument(minifiedTestDocument, false); + var minifiedFileSize = new FileInfo(minifiedDocumentPath).Length; + + Assert.True(minifiedFileSize < originalFileSize); + Assert.InRange(minifiedFileSize, 100, 50000); + + // Cleanup + Directory.Delete(minifier.GetExtractedDocDirectory(), true); + Directory.Delete(minifier.GetOriginalMediaDirectory(), true); + File.Delete(minifiedTestDocument); + File.Delete(minifiedDocumentPath); + } + + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestDocumentMinificationCleansUpProperly(bool shouldCleanUp) { + var minifiedTestDocument = CreateMinifiedTestDocument(".pptx", _tempDir); + var minifier = new DocumentMinifier(_tempDir); + var minifiedDocumentPath = minifier.MinifyDocument(minifiedTestDocument, shouldCleanUp); + + Assert.Equal(shouldCleanUp, !Directory.Exists(minifier.GetExtractedDocDirectory())); + + // Cleanup + if (!shouldCleanUp) Directory.Delete(minifier.GetExtractedDocDirectory(), true); + Directory.Delete(minifier.GetOriginalMediaDirectory(), true); + File.Delete(minifiedTestDocument); + 
File.Delete(minifiedDocumentPath); + } + + [Fact] + public void TestDeminifyDocumentHappyPath() { + var inputFile = CreateMinifiedTestDocument(".zip", _tempDir); + var outputFile = Path.Combine(_tempDir, "example_zip_transformed.zip"); + var minifier = new DocumentMinifier(_tempDir); + var minifiedFile = minifier.MinifyDocument(inputFile, true); + minifier.DeminifyDocument(minifiedFile, outputFile, false); + + var inputExtractionDir = Path.Combine(_tempDir, "input_dir"); + var outputExtractionDir = Path.Combine(_tempDir, "output_dir"); + ZipFile.ExtractToDirectory(inputFile, inputExtractionDir); + ZipFile.ExtractToDirectory(outputFile, outputExtractionDir); + + Assert.True(AreDirectoriesEqual(inputExtractionDir, outputExtractionDir)); + + // Cleanup + Directory.Delete(_tempDir, true); + } + + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestDocumentDeminificationCleansUpProperly(bool shouldCleanUp) { + var minifiedTestDocument = CreateMinifiedTestDocument(".zip", _tempDir); + var outputFile = Path.Combine(_tempDir, "example_zip_transformed.zip"); + var minifier = new DocumentMinifier(); + var minifiedFile = minifier.MinifyDocument(minifiedTestDocument, true); + minifier.DeminifyDocument(minifiedFile, outputFile, shouldCleanUp); + + Assert.Equal(shouldCleanUp, !Directory.Exists(minifier.GetExtractedDocDirectory())); + + // Cleanup + if (!shouldCleanUp) { + Directory.Delete(minifier.GetExtractedDocDirectory(), true); + Directory.Delete(minifier.GetOriginalMediaDirectory(), true); + File.Delete(minifiedFile); + } + + File.Delete(minifiedTestDocument); + File.Delete(outputFile); + } + + [RealServerOnlyFact] + public async Task TestMinifyAndTranslateDocuments() { + var translator = CreateTestTranslator(); + var extensions = new List() { ".docx", ".pptx" }; + foreach (var extension in extensions) { + var exampleDocumentPath = CreateMinifiedTestDocument(extension, _tempDir); + var outputDocumentPath = OutputDocumentPath(extension); + + await 
translator.TranslateDocumentAsync( + new FileInfo(exampleDocumentPath), + new FileInfo(outputDocumentPath), + "EN", + "DE", + new DocumentTranslateOptions { EnableDocumentMinification = true }); + + // If the output exists, the input document must have been minified as TranslateDocumentAsync + // will not succeed for files over 30 MB + Assert.True(File.Exists(outputDocumentPath)); + Assert.NotInRange(new FileInfo(exampleDocumentPath).Length, 0, 30000000); + } + + // Cleanup + Directory.Delete(_tempDir, true); + } + } +} diff --git a/DeepLTests/GeneralTest.cs b/DeepLTests/GeneralTest.cs index ea1d88f..98ff54f 100644 --- a/DeepLTests/GeneralTest.cs +++ b/DeepLTests/GeneralTest.cs @@ -5,11 +5,11 @@ using System; using System.Collections.Generic; using System.IO; +using System.Net; using System.Net.Http; using System.Threading.Tasks; using DeepL; using Xunit; -using Xunit.Abstractions; namespace DeepLTests { public sealed class GeneralTest : BaseDeepLTest { @@ -18,7 +18,7 @@ public sealed class GeneralTest : BaseDeepLTest { /// [Fact] public void TestVersion() { - Assert.Equal("1.11.0", Translator.Version()); + Assert.Equal("1.12.0", Translator.Version()); // Note the assembly version must remain unchanged for binary compatibility, excepting the major version. 
Assert.Equal("1.0.0.0", typeof(Translator).Assembly.GetName().Version?.ToString()); @@ -43,9 +43,12 @@ public async Task TestExampleTranslation() { [Fact] public async Task TestDefaultUserAgentHeader() { var mockHandler = getMockHandler("{\"character_count\": 180118,\"character_limit\": 1250000}"); - var translator = new Translator(AuthKey, new TranslatorOptions { ClientFactory = () => new HttpClientAndDisposeFlag { - HttpClient = new HttpClient(mockHandler), DisposeClient = true, - } }); + var translator = new Translator( + AuthKey, + new TranslatorOptions { + ClientFactory = () => + new HttpClientAndDisposeFlag { HttpClient = new HttpClient(mockHandler), DisposeClient = true, } + }); var usage = await translator.GetUsageAsync(); Assert.Single(mockHandler.requests); var userAgentHeader = mockHandler.requests[0].Headers.UserAgent; @@ -57,9 +60,13 @@ public async Task TestDefaultUserAgentHeader() { [Fact] public async Task TestOptInUserAgentHeader() { var mockHandler = getMockHandler("{\"character_count\": 180118,\"character_limit\": 1250000}"); - var translator = new Translator(AuthKey, new TranslatorOptions { sendPlatformInfo = true, ClientFactory = () => new HttpClientAndDisposeFlag { - HttpClient = new HttpClient(mockHandler), DisposeClient = true, - } }); + var translator = new Translator( + AuthKey, + new TranslatorOptions { + sendPlatformInfo = true, + ClientFactory = () => + new HttpClientAndDisposeFlag { HttpClient = new HttpClient(mockHandler), DisposeClient = true, } + }); var usage = await translator.GetUsageAsync(); Assert.Single(mockHandler.requests); var userAgentHeader = mockHandler.requests[0].Headers.UserAgent; @@ -68,13 +75,16 @@ public async Task TestOptInUserAgentHeader() { Assert.Contains("dotnet-clr/", userAgentHeader.ToString()); } - [Fact] public async Task TestOptOutUserAgentHeader() { var mockHandler = getMockHandler("{\"character_count\": 180118,\"character_limit\": 1250000}"); - var translator = new Translator(AuthKey, new 
TranslatorOptions { sendPlatformInfo = false, ClientFactory = () => new HttpClientAndDisposeFlag { - HttpClient = new HttpClient(mockHandler), DisposeClient = true, - } }); + var translator = new Translator( + AuthKey, + new TranslatorOptions { + sendPlatformInfo = false, + ClientFactory = () => + new HttpClientAndDisposeFlag { HttpClient = new HttpClient(mockHandler), DisposeClient = true, } + }); var usage = await translator.GetUsageAsync(); Assert.Single(mockHandler.requests); var userAgentHeader = mockHandler.requests[0].Headers.UserAgent; @@ -86,9 +96,14 @@ public async Task TestOptOutUserAgentHeader() { [Fact] public async Task TestDefaultUserAgentHeaderWithAppInfo() { var mockHandler = getMockHandler("{\"character_count\": 180118,\"character_limit\": 1250000}"); - var translator = new Translator(AuthKey, new TranslatorOptions {sendPlatformInfo = true, appInfo = new AppInfo { AppName = "my-dotnet-test-app", AppVersion = "1.2.3"}, ClientFactory = () => new HttpClientAndDisposeFlag { - HttpClient = new HttpClient(mockHandler), DisposeClient = true, - }}); + var translator = new Translator( + AuthKey, + new TranslatorOptions { + sendPlatformInfo = true, + appInfo = new AppInfo { AppName = "my-dotnet-test-app", AppVersion = "1.2.3" }, + ClientFactory = () => + new HttpClientAndDisposeFlag { HttpClient = new HttpClient(mockHandler), DisposeClient = true, } + }); var usage = await translator.GetUsageAsync(); Assert.Single(mockHandler.requests); var userAgentHeader = mockHandler.requests[0].Headers.UserAgent; @@ -101,9 +116,14 @@ public async Task TestDefaultUserAgentHeaderWithAppInfo() { [Fact] public async Task TestOptInUserAgentHeaderWithAppInfo() { var mockHandler = getMockHandler("{\"character_count\": 180118,\"character_limit\": 1250000}"); - var translator = new Translator(AuthKey, new TranslatorOptions { sendPlatformInfo = true, appInfo = new AppInfo { AppName = "my-dotnet-test-app", AppVersion = "1.2.3" }, ClientFactory = () => new HttpClientAndDisposeFlag 
{ - HttpClient = new HttpClient(mockHandler), DisposeClient = true, - } }); + var translator = new Translator( + AuthKey, + new TranslatorOptions { + sendPlatformInfo = true, + appInfo = new AppInfo { AppName = "my-dotnet-test-app", AppVersion = "1.2.3" }, + ClientFactory = () => + new HttpClientAndDisposeFlag { HttpClient = new HttpClient(mockHandler), DisposeClient = true, } + }); var usage = await translator.GetUsageAsync(); Assert.Single(mockHandler.requests); var userAgentHeader = mockHandler.requests[0].Headers.UserAgent; @@ -113,13 +133,17 @@ public async Task TestOptInUserAgentHeaderWithAppInfo() { Assert.Contains("my-dotnet-test-app/1.2.3", userAgentHeader.ToString()); } - [Fact] public async Task TestOptOutUserAgentHeaderWithAppInfo() { var mockHandler = getMockHandler("{\"character_count\": 180118,\"character_limit\": 1250000}"); - var translator = new Translator(AuthKey, new TranslatorOptions { sendPlatformInfo = false, appInfo = new AppInfo { AppName = "my-dotnet-test-app", AppVersion = "1.2.3" }, ClientFactory = () => new HttpClientAndDisposeFlag { - HttpClient = new HttpClient(mockHandler), DisposeClient = true, - } }); + var translator = new Translator( + AuthKey, + new TranslatorOptions { + sendPlatformInfo = false, + appInfo = new AppInfo { AppName = "my-dotnet-test-app", AppVersion = "1.2.3" }, + ClientFactory = () => + new HttpClientAndDisposeFlag { HttpClient = new HttpClient(mockHandler), DisposeClient = true, } + }); var usage = await translator.GetUsageAsync(); Assert.Single(mockHandler.requests); var userAgentHeader = mockHandler.requests[0].Headers.UserAgent; @@ -227,9 +251,7 @@ public async Task TestProxyUsage() { ServerUrl = ServerUrl, ClientFactory = () => { - var handler = new System.Net.Http.HttpClientHandler() { - Proxy = new System.Net.WebProxy(ProxyUrl), UseProxy = true, - }; + var handler = new HttpClientHandler() { Proxy = new WebProxy(ProxyUrl), UseProxy = true, }; return new HttpClientAndDisposeFlag { HttpClient = new 
HttpClient(handler), DisposeClient = true, diff --git a/DeepLTests/resources/example_document_template.docx b/DeepLTests/resources/example_document_template.docx new file mode 100644 index 0000000..7ddbf3d Binary files /dev/null and b/DeepLTests/resources/example_document_template.docx differ diff --git a/DeepLTests/resources/example_presentation_template.pptx b/DeepLTests/resources/example_presentation_template.pptx new file mode 100644 index 0000000..1e5f5f7 Binary files /dev/null and b/DeepLTests/resources/example_presentation_template.pptx differ diff --git a/DeepLTests/resources/example_zip_template.zip b/DeepLTests/resources/example_zip_template.zip new file mode 100644 index 0000000..f1ca638 Binary files /dev/null and b/DeepLTests/resources/example_zip_template.zip differ diff --git a/README.md b/README.md index 5daa66b..94c7256 100644 --- a/README.md +++ b/README.md @@ -216,6 +216,63 @@ application needs to execute these steps individually, you can instead use the f - `Formality`: same as in [Text translation options](#text-translation-options). - `GlossaryId`: same as in [Text translation options](#text-translation-options). +- `EnableDocumentMinification`: A `bool` value. If set to `true`, the library will try to minify a document +before translating it through the API, sending a smaller document if the file contains a lot of media. This is +currently only supported for `pptx` and `docx` files. See also [Document minification](#document-minification). +Note that this only works in the high-level `TranslateDocumentAsync` method, not +`TranslateDocumentUploadAsync`. However, the behavior can be emulated by creating a new `DocumentMinifier` +object and calling the minifier's methods in between. + +#### Document minification +In some contexts, one can end up with large document files (e.g. PowerPoint presentations +or Word files with many contributors, especially in a larger organization).
However, the +DeepL API enforces a limit of 30 MB for most of these files (see Usage Limits in the docs). +In the case that most of this size comes from media included in the documents (e.g. images, +videos, animations), document minification can help. +In this case, the library will create a temporary directory to extract the document into, +replace the large media with tiny placeholders, create a minified document, translate that +via the API, and re-insert the original media into the translated file. Please note that this +requires a bit of additional (temporary) disk space; we recommend at least 2x the file size +of the document to be translated. +To use document minification, simply pass the option to the `TranslateDocumentAsync` function: +```c# +await translator.TranslateDocumentAsync( + inFile, outFile, "EN", "DE", new DocumentTranslateOptions { EnableDocumentMinification = true } +); +``` +To use document minification with the lower-level `TranslateDocumentUploadAsync`, +`TranslateDocumentWaitUntilDoneAsync` and `TranslateDocumentDownloadAsync` methods, and for other details, +see the `DocumentMinifier` class. +Currently supported document types for minification: +1. `pptx` +2. `docx` + Currently supported media types for minification: +1. `png` +2. `jpg` +3. `jpeg` +4. `emf` +5. `bmp` +6. `tiff` +7. `wdp` +8. `svg` +9. `gif` +10. `mp4` +11. `asf` +12. `avi` +13. `m4v` +14. `mpg` +15. `mpeg` +16. `wmv` +17. `mov` +18. `aiff` +19. `au` +20. `mid` +21. `midi` +22. `mp3` +23. `m4a` +24. `wav` +25. `wma` + ### Glossaries