diff --git a/src/util/state.ts b/src/util/state.ts index ed8fed26..7165b5c2 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -349,7 +349,8 @@ export class RedisDedupeIndex { commitToAllKey = false, ) { crawlId = crawlId || this.crawlId; - if (isDupe) { + // if not a dupe, add to unique size count + if (!isDupe) { await this.dedupeRedis.hincrby(`h:${crawlId}:counts`, "uniqueSize", size); if (commitToAllKey) { await this.dedupeRedis.hincrby(DUPE_ALL_COUNTS, "uniqueSize", size); diff --git a/tests/dedupe-basic.test.js b/tests/dedupe-basic.test.js index 91a57c2c..52e8240b 100644 --- a/tests/dedupe-basic.test.js +++ b/tests/dedupe-basic.test.js @@ -143,7 +143,7 @@ test("check revisit records written on duplicate crawl, same collection, no wacz numResponses = response; - await checkSizeStats(numResponses, "allcounts", 0, 180000); + await checkSizeStats(numResponses, "allcounts", 0, 10000); }); @@ -190,7 +190,7 @@ test("check revisit records written on duplicate crawl, different collections, w numResponses = response; - await checkSizeStats(numResponses, "allcounts", 1, 48400000); + await checkSizeStats(numResponses, "allcounts", 1, 27000); }); @@ -226,7 +226,7 @@ test("verify crawl with imported dupe index has same dupes as dedupe against ori // matches same number of revisits as original expect(revisit).toBe(numResponses); - await checkSizeStats(numResponses, "allcounts", 2, 48400000); + await checkSizeStats(numResponses, "allcounts", 2, 27000); }); test("test requires in datapackage.json of wacz deduped against previous crawl", () => {