Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regarding the issue of being unable to restore when the number of keys is very large #924

Open
cnjeffreyloo opened this issue Aug 12, 2024 · 0 comments

Comments

@cnjeffreyloo
Copy link

When I generate a large amount of data using FasterKV (with the number of keys exceeding 300 million), and then save a checkpoint and load it again, an error occurs, and I can no longer recover the original data. What could be the issue?

Here is the source code:

using FASTER.core;
using System.Diagnostics;


class Program
{
    // Number of key/value pairs to generate and verify.
    // NOTE(review): with 300M keys but only 1L << 18 hash-index buckets (see
    // CreateStore), the run log reports ~42.7M overflow buckets — the index is
    // heavily undersized for this data volume, which may be related to the
    // recovery failure being reported. TODO: retry with a larger index (e.g. 1L << 26).
    const int DataSize = 300000000;
    //const int DataSize = 100000;

    static async Task Main()
    {
        CleanData();
        await CreateLargeData();
        await LoadDataAndVerify();
    }

    /// <summary>Deletes data left over from a previous run so the test starts clean.</summary>
    private static void CleanData()
    {
        var sw = Stopwatch.StartNew();
        Console.WriteLine("Cleaning data ...");
        if (Directory.Exists("./data"))
        {
            Directory.Delete("./data", true);
        }
        sw.Stop();
        Console.WriteLine($"  - Cleaning data done. (Elapsed: {sw.Elapsed.TotalSeconds:f2}s)");
    }

    /// <summary>
    /// Creates a FasterKV store backed by on-disk log devices under ./data.
    /// </summary>
    /// <param name="tryRecoverLatest">When true, attempts to recover from the latest checkpoint.</param>
    // NOTE(review): the IDevice instances created here are never explicitly
    // disposed; disposing the store does not dispose the devices. Acceptable for
    // a repro program, but a leak in long-lived code.
    private static FasterKV<long, long> CreateStore(bool tryRecoverLatest = false)
    {
        var log = Devices.CreateLogDevice("./data/hlog.log");
        var objlog = Devices.CreateLogDevice("./data/hlog.obj.log");
        var store = new FasterKV<long, long>(
            // 1L << 20, 
            1L << 18,
            new LogSettings { LogDevice = log, ObjectLogDevice = objlog }, 
            new CheckpointSettings{ CheckpointDir = "./data/checkpoint" },
            tryRecoverLatest: tryRecoverLatest);
        return store;
    }

    /// <summary>
    /// Generates <see cref="DataSize"/> sequential key/value pairs (value == key),
    /// then takes a full snapshot checkpoint.
    /// </summary>
    private static async Task CreateLargeData()
    {
        Console.WriteLine($"Creating large data: size:{DataSize} ...");

        // Create FasterKV instance
        var store = CreateStore();
        using var session = store.NewSession(new SimpleFunctions<long, long>());

        // Write data
        var sw = Stopwatch.StartNew();
        var lastShowProgress = 0;
        for (long i = 0; i < DataSize; i++)
        {
            session.Upsert(i, i);
            var totalSeconds = (int)sw.Elapsed.TotalSeconds;
            if (totalSeconds - lastShowProgress > 5)
            {
                // Display progress
                lastShowProgress = totalSeconds;
                var currentSpeed = i / sw.Elapsed.TotalSeconds;
                Console.WriteLine($"  - Creating progress: {i}/{DataSize} ({i * 100.0 / DataSize:f2}%) (Elapse: {totalSeconds}, Speed: {currentSpeed:f2} ops/s) ...");
            }
        }

        // Write checkpoint.
        // FIX: complete pending operations BEFORE taking the checkpoint, so every
        // upsert issued above is guaranteed to be covered by the snapshot.
        // (The original awaited TakeFullCheckpointAsync first.)
        var writeCheckpointSw = Stopwatch.StartNew();
        Console.WriteLine("  - Writing checkpoint ...");
        await session.CompletePendingAsync();
        await store.TakeFullCheckpointAsync(CheckpointType.Snapshot);
        // store.Log.Flush(true);
        writeCheckpointSw.Stop();
        Console.WriteLine($"    - Writing checkpoint done. (Elapsed: {writeCheckpointSw.Elapsed.TotalSeconds:f2}s)");

        ShowStoreInfo(store);

        // Dispose resources. The session is disposed by its `using` declaration
        // (the original also called session.Dispose() explicitly — redundant).
        store.Dispose();

        var speed = DataSize / sw.Elapsed.TotalSeconds;
        Console.WriteLine($"  - All done. (Elapsed: {sw.Elapsed.TotalSeconds:f2}s, Speed: {speed:f2} ops/s)");
    }

    /// <summary>
    /// Recovers the store from the latest checkpoint and verifies that every key
    /// in [0, DataSize) is present with value == key, counting mismatches.
    /// </summary>
    static async Task LoadDataAndVerify()
    {
        Console.WriteLine($"Loading data and verify: size:{DataSize} ...");

        var loadSw = Stopwatch.StartNew();
        var store = CreateStore(tryRecoverLatest: true);
        loadSw.Stop();
        Console.WriteLine($"  - Loading data done. (Elapsed: {loadSw.Elapsed.TotalSeconds:f2}s)");
        ShowStoreInfo(store);

        using var session = store.NewSession(new SimpleFunctions<long, long>());

        // Read & verify data. Error messages are rate-limited to one per ~5s;
        // errorCount still counts every failure.
        var sw = Stopwatch.StartNew();
        var lastShowProgressTime = 0;
        var lastShowErrorTime = -100; // far in the past so the first error always prints
        var errorCount = 0;
        for (long i = 0; i < DataSize; i++)
        {
            var (status, output) = (await session.ReadAsync(i)).Complete();
            if (!status.Found)
            {
                if (sw.Elapsed.TotalSeconds - lastShowErrorTime > 5)
                {
                    lastShowErrorTime = (int)sw.Elapsed.TotalSeconds;
                    Console.WriteLine($"  - Error: key {i} not found. (Status: {status})  (ErrorCount: {errorCount})");
                }
                errorCount++;
                continue;
            }

            if (output != i)
            {
                if (sw.Elapsed.TotalSeconds - lastShowErrorTime > 5)
                {
                    lastShowErrorTime = (int)sw.Elapsed.TotalSeconds;
                    Console.WriteLine($"  - Error: key {i} value is {output}, but expect is {i}.  (ErrorCount: {errorCount})");
                }
                errorCount++;
            }

            var totalSeconds = (int)sw.Elapsed.TotalSeconds;
            if (totalSeconds - lastShowProgressTime > 5)
            {
                // Display progress.
                // FIX: original message said "Creating progress" here — copy-paste
                // error from CreateLargeData; this loop is verifying, not creating.
                lastShowProgressTime = totalSeconds;
                var currentSpeed = i / sw.Elapsed.TotalSeconds;
                Console.WriteLine($"  - Verifying progress: {i}/{DataSize} ({i * 100.0 / DataSize:f2}%) (Elapse: {totalSeconds}, Speed: {currentSpeed:f2} ops/s) ...");
            }
        }

        // Write checkpoint.
        // FIX: complete pending operations before checkpointing (same ordering
        // fix as in CreateLargeData).
        await session.CompletePendingAsync();
        await store.TakeFullCheckpointAsync(CheckpointType.Snapshot);

        ShowStoreInfo(store);

        // Dispose resources (session disposed by its `using` declaration).
        store.Dispose();

        var speed = DataSize / sw.Elapsed.TotalSeconds;
        Console.WriteLine($"  - All done. (ErrorCount: {errorCount}, Elapsed: {sw.Elapsed.TotalSeconds:f2}s, Speed: {speed:f2} ops/s)");
    }

    /// <summary>Prints entry count, index size, overflow bucket count and in-memory log size.</summary>
    static void ShowStoreInfo(FasterKV<long, long> store)
    {
        var sw = Stopwatch.StartNew();

        // Get FasterKV store info
        var entryCount = store.EntryCount;
        var indexSize = store.IndexSize;
        var overflowBucketCount = store.OverflowBucketCount;
        var memorySizeBytes = store.Log.MemorySizeBytes;
        sw.Stop();
        Console.WriteLine(
            $"  - Store info: EntryCount: {entryCount}, IndexSize: {indexSize}, overflowBucketCount: {overflowBucketCount}, "
            + $"memorySizeBytes: {memorySizeBytes}, (Get info elapse: {sw.Elapsed.TotalSeconds:f2}s)"
        );

    }
}

Run result:

Cleaning data ...
  - Cleaning data done. (Elapsed: 2.66s)
Creating large data: size:300000000 ...
  - Creating progress: 8333810/300000000 (2.78%) (Elapse: 6, Speed: 1388968.26 ops/s) ...
   ...
  - Creating progress: 299706408/300000000 (99.90%) (Elapse: 2520, Speed: 118931.11 ops/s) ...
  - Writing checkpoint ...
    - Writing checkpoint done. (Elapsed: 76.84s)
  - Store info: EntryCount: 300000000, IndexSize: 262144, overflowBucketCount: 42710337, memorySizeBytes: 7247757312, (Get info elapse: 16.72s)
  - All done. (Elapsed: 2619.05s, Speed: 114545.54 ops/s)
Loading data and verify: size:300000000 ...
  - Loading data done. (Elapsed: 0.85s)
  - Store info: EntryCount: 1835008, IndexSize: 262144, overflowBucketCount: 42710337, memorySizeBytes: 67108864, (Get info elapse: 0.02s)
  - Error: key 0 not found. (Status: NotFound)  (ErrorCount: 0)
  - Error: key 8532049 not found. (Status: NotFound)  (ErrorCount: 8532049)
  ...
  - Error: key 299672862 not found. (Status: NotFound)  (ErrorCount: 299672862)
  - Store info: EntryCount: 1835008, IndexSize: 262144, overflowBucketCount: 42710337, memorySizeBytes: 67108864, (Get info elapse: 0.01s)
  - All done. (ErrorCount: 300000000, Elapsed: 175.54s, Speed: 1708998.12 ops/s)

Note: After recovery, the reported EntryCount is drastically lower than what was written, and none of the keys can be found. (With DataSize = 200000000, recovery works normally and no errors occur.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant