Skip to content

Commit 0b49e92

Browse files
committed
feat: add dedup layer (INV-CF-1) to scrape_batch + field filter to search_engine_batch
SECURITY FIXES: add prototype pollution protection in filterFields(); block the __proto__, constructor, and prototype properties; sanitize error messages to prevent information disclosure; no hardcoded API keys in any file. FUNCTIONALITY: add a deduplication layer to the scrape_batch tool; add field filtering to the search_engine_batch tool; remove duplicate content blocks across URLs; include a metrics option for dedup stats. TEST FILES: test_context_cache.js (9 tests); test_dedup_edge_cases.js (8 tests); test_filter_fields.js (20 tests). Total: 37 tests passing.
1 parent d691e28 commit 0b49e92

5 files changed

Lines changed: 581 additions & 36 deletions

File tree

context_cache.js

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
// context_cache.js
2+
// Deduplication layer for batch scraping
3+
'use strict';
4+
5+
import crypto from 'node:crypto';
6+
7+
// Default number of leading characters included in a fingerprint.
const DEFAULT_PREFIX_LEN = 2048;
// Size of the extra windows sampled from the middle and the end of long content.
const SAMPLE_WINDOW_LEN = 256;

/**
 * Content-fingerprint cache used to detect duplicate content within a batch.
 *
 * Content no longer than `prefix_len` characters is hashed in full (SHA-256).
 * Longer content is fingerprinted from three samples: the first `prefix_len`
 * characters, 256 characters starting at the midpoint, and the last 256
 * characters. Two long documents that differ only outside those windows are
 * therefore reported as duplicates; raise `prefix_len` for more precision.
 */
export class ContextCache {
  /**
   * @param {object} [options]
   * @param {number} [options.prefix_len=2048] Number of leading characters
   *   hashed for long content, and the length up to which content is hashed
   *   in full. Values that are not positive integers fall back to 2048.
   */
  constructor(options = {}) {
    const requested = options?.prefix_len;
    this._seen = new Map();
    this._prefix_len =
      Number.isInteger(requested) && requested > 0
        ? requested
        : DEFAULT_PREFIX_LEN;
    this._stats = { hits: 0, misses: 0, bytes_saved: 0 };
  }

  /**
   * Select the text that is fed to the hash for the given content.
   * @param {string} content
   * @returns {string} The full content when it is short enough, otherwise the
   *   concatenated start/middle/end samples.
   */
  _sample(content) {
    if (content.length <= this._prefix_len) {
      return content;
    }
    const midIdx = Math.floor(content.length / 2);
    return (
      content.slice(0, this._prefix_len) +
      content.slice(midIdx, midIdx + SAMPLE_WINDOW_LEN) +
      content.slice(-SAMPLE_WINDOW_LEN)
    );
  }

  /**
   * Check whether content with the same fingerprint was already seen, and
   * record it under `url` when it was not.
   * @param {string} content
   * @param {string} url
   * @returns {{ isDuplicate: boolean, contentHash: string, duplicateOf?: string }}
   */
  check(content, url) {
    const hash = crypto
      .createHash('sha256')
      .update(this._sample(content))
      .digest('hex');

    if (this._seen.has(hash)) {
      this._stats.hits++;
      // Count real UTF-8 bytes; `content.length` counts UTF-16 code units.
      this._stats.bytes_saved += Buffer.byteLength(content, 'utf8');
      return {
        isDuplicate: true,
        contentHash: hash,
        duplicateOf: this._seen.get(hash),
      };
    }

    this._seen.set(hash, url);
    this._stats.misses++;
    return { isDuplicate: false, contentHash: hash };
  }

  /**
   * Return deduplication stats.
   * @returns {{ unique_blocks: number, duplicate_blocks: number,
   *   bytes_saved: number, dedup_ratio: string }} `dedup_ratio` is the share
   *   of checks that were duplicates, formatted with three decimals.
   */
  stats() {
    const { hits, misses, bytes_saved } = this._stats;
    return {
      unique_blocks: misses,
      duplicate_blocks: hits,
      bytes_saved,
      // Guard on hits so an unused cache reports '0.000' instead of NaN.
      dedup_ratio: hits > 0 ? (hits / (hits + misses)).toFixed(3) : '0.000',
    };
  }

  /**
   * Clear the cache. Useful for long-running processes.
   */
  clear() {
    this._seen.clear();
    this._stats = { hits: 0, misses: 0, bytes_saved: 0 };
  }
}
78+
79+
// Property names that are never copied, even when explicitly requested.
const PROTECTED_PROPS = new Set(['__proto__', 'constructor', 'prototype']);

/**
 * Reduce each result object to the requested fields.
 *
 * Only the item's own properties are copied; inherited members such as
 * `toString` are ignored, and the names in PROTECTED_PROPS are always dropped.
 *
 * @param {Array} results Items to filter. A non-array value is returned as is.
 * @param {string[]} fields Field names to keep. When missing, empty, or not
 *   an array, `results` is returned unchanged.
 * @returns {Array} A new array with one plain object per input item; items
 *   that are null or not objects become `{}`.
 */
export function filterFields(results, fields) {
  if (!Array.isArray(fields) || fields.length === 0) return results;
  if (!Array.isArray(results)) return results;

  // Filter out dangerous properties
  const safeFields = fields.filter((f) => !PROTECTED_PROPS.has(f));

  return results.map((item) => {
    if (item === null || typeof item !== 'object') return {};
    return Object.fromEntries(
      safeFields
        // Own-property check: `in` would also match prototype members.
        .filter((f) => Object.prototype.hasOwnProperty.call(item, f))
        .map((f) => [f, item[f]])
    );
  });
}
102+
103+
/**
 * Build metrics summary for batch responses.
 *
 * @param {{ stats: Function } | null | undefined} cache Object whose `stats()`
 *   result is reported under `dedup`. When absent, `dedup` is `null` so timing
 *   information can still be reported.
 * @param {object} [timings={}] Timing values to include verbatim; a null
 *   value is reported as `{}`.
 * @returns {{ version: string, dedup: object | null, timings: object,
 *   timestamp_utc: string }} Summary stamped with the current time in ISO 8601.
 */
export function buildBatchMetrics(cache, timings = {}) {
  return {
    version: '1.0.0',
    dedup: cache ? cache.stats() : null,
    timings: timings ?? {},
    timestamp_utc: new Date().toISOString(),
  };
}

server.js

Lines changed: 117 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {parse_google_search_response} from './search_utils.js';
1010
import {createRequire} from 'node:module';
1111
import {remark} from 'remark';
1212
import strip from 'strip-markdown';
13+
import { ContextCache, filterFields, buildBatchMetrics } from './context_cache.js';
1314
const require = createRequire(import.meta.url);
1415
const package_json = require('./package.json');
1516
const api_token = process.env.API_TOKEN;
@@ -299,9 +300,12 @@ addTool({
299300
.optional()
300301
.describe('2-letter country code for geo-targeted results '
301302
+'(e.g., "us", "uk")'),
303+
fields: z.array(z.enum(['link', 'title', 'description', 'relevance_score', 'cursor']))
304+
.optional()
305+
.describe('Filter response to only these fields. Saves tokens in agent pipelines.'),
302306
})).min(1).max(5),
303307
}),
304-
execute: tool_fn('search_engine_batch', async({queries}, ctx)=>{
308+
execute: tool_fn('search_engine_batch', async({queries, fields}, ctx)=>{
305309
const search_promises = queries.map(({query, engine, cursor,
306310
geo_location})=>{
307311
const normalized_engine = engine || 'google';
@@ -349,49 +353,126 @@ addTool({
349353
});
350354

351355
const results = await Promise.all(search_promises);
352-
return JSON.stringify(results, null, 2);
356+
357+
// Apply field filtering if requested
358+
// For Google: filter within result.organic array
359+
// For Bing/Yandex: result is just text, no fields to filter
360+
let all_results = results;
361+
if (fields && Array.isArray(all_results)) {
362+
all_results = all_results.map(page_result => {
363+
if (page_result.result && typeof page_result.result === 'object' && Array.isArray(page_result.result.organic)) {
364+
return {
365+
...page_result,
366+
result: {
367+
...page_result.result,
368+
organic: filterFields(page_result.result.organic, fields),
369+
},
370+
};
371+
}
372+
return page_result;
373+
});
374+
}
375+
376+
return JSON.stringify(all_results, null, 2);
353377
}),
354378
});
355379

356380
// scrape_batch: fetch up to five URLs concurrently, optionally drop content
// that duplicates an earlier URL in the list, and optionally trim each result
// to the requested top-level fields.
addTool({
    name: 'scrape_batch',
    description: 'Scrape multiple webpages URLs with advanced options for '
        +'content extraction and get back the results in MarkDown language. '
        +'This tool can unlock any webpage even if it uses bot detection or '
        +'CAPTCHA.',
    annotations: {
        title: 'Batch Scrape',
        readOnlyHint: true,
        openWorldHint: true,
    },
    parameters: z.object({
        urls: z.array(z.string().url()).min(1).max(5)
            .describe('List of URLs to scrape (max 5)'),
        deduplicate: z.boolean().optional().default(true)
            .describe('Remove duplicate content blocks across URLs. '
                +'Deduplication: removes duplicate content blocks across URLs. Default: true.'),
        fields: z.array(z.string()).optional()
            .describe('Optional: return only these top-level fields from each result'),
        format: z.enum(['markdown', 'raw']).optional().default('markdown')
            .describe('Output format'),
        include_metrics: z.boolean().optional().default(false)
            .describe('Include deduplication metrics in response. Default: false (returns flat array).'),
    }),
    execute: tool_fn('scrape_batch', async (data, ctx) => {
        check_rate_limit();
        const cache = data.deduplicate ? new ContextCache() : null;
        const t0 = Date.now();

        // Phase 1: fetch every URL concurrently. Each task converts its own
        // failure into an error entry, so Promise.all never rejects here.
        const fetched = await Promise.all(data.urls.map(async url => {
            const t_url = Date.now();
            try {
                const response = await base_request({
                    url: 'https://api.brightdata.com/request',
                    method: 'POST',
                    headers: api_headers(ctx?.clientName, 'scrape_batch'),
                    data: {
                        zone: unlocker_zone,
                        url,
                        format: 'raw',
                        // NOTE(review): markdown is requested even when
                        // data.format is 'raw'; 'raw' only skips the strip
                        // step below. Confirm this is intended.
                        data_format: 'markdown',
                    },
                    // Keep the body as a string for remark and for hashing.
                    responseType: 'text',
                });

                let content = response.data;
                if (data.format === 'markdown') {
                    content = (await remark().use(strip, {
                        keep: ['link', 'linkReference', 'code', 'inlineCode'],
                    }).process(content)).value;
                }
                return {
                    url,
                    status: 'success',
                    latency_ms: Date.now() - t_url,
                    content,
                };
            } catch (e) {
                return {
                    url,
                    status: 'error',
                    latency_ms: Date.now() - t_url,
                    error: 'Scrape failed: ' + (e.response?.status ?? e.message),
                };
            }
        }));

        // Phase 2: deduplicate in input order so `duplicate_of` always names
        // an earlier URL from the request, independent of which response
        // arrived first. Error entries are passed through untouched so that
        // failures stay visible even when `fields` is set.
        const output = fetched.map(entry => {
            if (entry.status !== 'success')
                return entry;
            try {
                const dedup = cache?.check(entry.content, entry.url);
                const result = dedup?.isDuplicate
                    ? {
                        ...entry,
                        content: null,
                        skipped: true,
                        duplicate_of: dedup.duplicateOf,
                        content_hash: dedup.contentHash,
                    }
                    : {
                        ...entry,
                        content_hash: dedup?.contentHash ?? null,
                    };
                // Apply `fields` to the result's top-level keys, as the
                // parameter description promises.
                return data.fields
                    ? filterFields([result], data.fields)[0]
                    : result;
            } catch (e) {
                return {
                    url: entry.url,
                    status: 'error',
                    latency_ms: entry.latency_ms,
                    error: 'Scrape failed: ' + e.message,
                };
            }
        });

        if (data.include_metrics) {
            return JSON.stringify({
                results: output,
                metrics: cache
                    ? buildBatchMetrics(cache, { total_ms: Date.now() - t0 })
                    : null,
            }, null, 2);
        }
        return JSON.stringify(output, null, 2);
    }),
});
396477

397478
addTool({

0 commit comments

Comments
 (0)