// Horizon – Rev 15
//
// Subversion Repositories:
// Rev:
using Horizon.Database;
using Horizon.Searching;
using Horizon.Snapshots;
using Serilog;
using System;
using System.Collections.Generic;
using System.Data.SQLite;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using Tesseract;
using TrackedFolders;

namespace Horizon
{
    /// <summary>
    /// Static helpers that snapshot and index the files of a folder, and that extract
    /// words from a screen capture using Tesseract OCR.
    /// </summary>
    public static class Extensions
    {
        /// <summary>
        /// Guards <see cref="_tesseractEngine"/> so that only one image is processed at a time.
        /// </summary>
        private static readonly SemaphoreSlim _tesseractSemaphoreSlim = new SemaphoreSlim(1, 1);

        /// <summary>
        /// Measures the duration of a single OCR pass. Only used while
        /// <see cref="_tesseractSemaphoreSlim"/> is held.
        /// </summary>
        private static readonly Stopwatch _tesseractStopWatch = new Stopwatch();

        /// <summary>
        /// Shared OCR engine for English, loading its data from the "tessdata" folder under
        /// the application base directory.
        /// </summary>
        private static readonly TesseractEngine _tesseractEngine = new TesseractEngine(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "tessdata"), "eng", EngineMode.Default);

        /// <summary>
        /// Matches runs of non-whitespace characters delimited by word boundaries, which drops
        /// leading and trailing punctuation from each word.
        /// https://stackoverflow.com/questions/7311734/split-sentence-into-words-but-having-trouble-with-the-punctuations-in-c-sharp
        /// </summary>
        private static readonly Regex _splitWordRegex = new Regex(@"((\b[^\s]+\b)((?<=\.\w).)?)", RegexOptions.Compiled);

        /// <summary>
        /// Creates a snapshot for every file located directly in <paramref name="path"/>
        /// (subdirectories are not visited) and indexes each snapshot that was created.
        /// </summary>
        /// <param name="path">Directory whose files are enumerated.</param>
        /// <param name="trackedFolders">Looked up by each file's directory to obtain the snapshot color.</param>
        /// <param name="snapshotDatabase">Database in which the snapshots are created.</param>
        /// <param name="searchEngine">Search engine that indexes each created snapshot.</param>
        /// <param name="cancellationToken">Token that stops the run; cancellation surfaces as <see cref="OperationCanceledException"/>.</param>
        /// <returns>A task that completes once all files have been processed.</returns>
        public static Task TakeSnapshot(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, SearchEngine searchEngine, CancellationToken cancellationToken)
        {
            return TakeSnapshotsAsync(path, SearchOption.TopDirectoryOnly, trackedFolders, snapshotDatabase, searchEngine, cancellationToken);
        }

        /// <summary>
        /// Creates a snapshot for every file in <paramref name="path"/> and all of its
        /// subdirectories, and indexes each snapshot that was created.
        /// </summary>
        /// <param name="path">Root directory whose files are enumerated recursively.</param>
        /// <param name="trackedFolders">Looked up by each file's directory to obtain the snapshot color.</param>
        /// <param name="snapshotDatabase">Database in which the snapshots are created.</param>
        /// <param name="searchEngine">Search engine that indexes each created snapshot.</param>
        /// <param name="cancellationToken">Token that stops the run; cancellation surfaces as <see cref="OperationCanceledException"/>.</param>
        /// <returns>A task that completes once all files have been processed.</returns>
        public static Task TakeSnapshotRecursive(string path, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, SearchEngine searchEngine, CancellationToken cancellationToken)
        {
            return TakeSnapshotsAsync(path, SearchOption.AllDirectories, trackedFolders, snapshotDatabase, searchEngine, cancellationToken);
        }

        /// <summary>
        /// Shared implementation of <see cref="TakeSnapshot"/> and <see cref="TakeSnapshotRecursive"/>:
        /// enumerates the files selected by <paramref name="searchOption"/>, snapshots each one and
        /// indexes the result. A failure on one file is logged and does not stop the remaining files.
        /// </summary>
        private static async Task TakeSnapshotsAsync(string path, SearchOption searchOption, TrackedFolders.TrackedFolders trackedFolders, SnapshotDatabase snapshotDatabase, SearchEngine searchEngine, CancellationToken cancellationToken)
        {
            foreach (var file in Directory.EnumerateFiles(path, "*.*", searchOption))
            {
                cancellationToken.ThrowIfCancellationRequested();

                var fileName = Path.GetFileName(file);

                // The directory must come from the full path; the bare file name has no
                // directory component, so using it would never match a tracked folder.
                var directory = Path.GetDirectoryName(file);
                var color = Color.Empty;
                if (trackedFolders.TryGet(directory, out var folder))
                {
                    color = folder.Color;
                }

                try
                {
                    if (await snapshotDatabase.CreateSnapshotAsync(fileName, file, color, cancellationToken) is Snapshot snapshot)
                    {
                        await searchEngine.Index(snapshot, cancellationToken);
                    }
                }
                catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
                {
                    // Cancellation is not a per-file failure: stop instead of logging and continuing.
                    throw;
                }
                catch (SQLiteException exception) when (exception.ResultCode == SQLiteErrorCode.Constraint)
                {
                    Log.Information(exception, "Snapshot already exists.");
                }
                catch (Exception exception)
                {
                    // Also reached by non-constraint SQLite errors, which must not be dropped silently.
                    // ":l" renders the path without quotes, keeping the message text unchanged.
                    Log.Error(exception, "Could not take snapshot of file: {File:l}", file);
                }
            }
        }

        /// <summary>
        /// Runs OCR on <paramref name="screenCapture"/> and yields the words found in the
        /// recognized text, as split by <see cref="_splitWordRegex"/>.
        /// </summary>
        /// <param name="screenCapture">Image to recognize; it is encoded as BMP before being handed to Tesseract.</param>
        /// <param name="cancellationToken">
        /// Token observed while waiting for the OCR engine and while buffering words. Cancelling while
        /// waiting for the engine throws <see cref="OperationCanceledException"/>.
        /// </param>
        /// <returns>
        /// The recognized words. Exceptions raised during OCR are logged and end the sequence
        /// instead of being thrown to the consumer.
        /// </returns>
        public static async IAsyncEnumerable<string> RecognizeStrings(Bitmap screenCapture, [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            var bufferBlock = new BufferBlock<string>(new DataflowBlockOptions { CancellationToken = cancellationToken, EnsureOrdered = false });

            // Tesseract can only process a single image at once. Observing the token here keeps a
            // cancelled caller from queueing for the engine only to run a pointless OCR pass.
            await _tesseractSemaphoreSlim.WaitAsync(cancellationToken);
            try
            {
                // OCR the image.
                using var memoryStream = new MemoryStream();
                screenCapture.Save(memoryStream, System.Drawing.Imaging.ImageFormat.Bmp);
                using var pix = Pix.LoadFromMemory(memoryStream.ToArray());
                using var page = _tesseractEngine.Process(pix);

                // Restart (not Start) so the shared stopwatch reports this pass only
                // rather than the accumulated time of every previous call.
                _tesseractStopWatch.Restart();
                var text = page.GetText();
                _tesseractStopWatch.Stop();

                Log.Information("Tesseract OCR complete in {Elapsed}", _tesseractStopWatch.Elapsed);

                foreach (Match match in _splitWordRegex.Matches(text))
                {
                    await bufferBlock.SendAsync(match.Value, cancellationToken);
                }
            }
            catch (Exception exception)
            {
                Log.Error(exception, "Exception thrown while processing images with OCR");
            }
            finally
            {
                // Always complete the block so the drain loop below terminates.
                bufferBlock.Complete();
                _tesseractSemaphoreSlim.Release();
            }

            // Yielding happens outside the try/catch because C# does not allow
            // "yield return" inside a try block that has a catch clause.
            while (await bufferBlock.OutputAvailableAsync())
            {
                if (bufferBlock.TryReceive(out var term))
                {
                    yield return term;
                }
            }
        }
    }
}