Skip to content

Commit ef16b66

Browse files
committed
feat: add opt-in field filtering to scrape_batch + search_engine_batch
Optional `fields` param returns only the requested top-level fields, to save tokens in agent pipelines. Backward compatible: when omitted, output is unchanged. scrape_batch also isolates per-URL errors (a failed URL resolves to {url, error} instead of a rejected settlement).
1 parent d691e28 commit ef16b66

3 files changed

Lines changed: 206 additions & 22 deletions

File tree

field_filter.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
'use strict'; /*jslint node:true es9:true*/

// Opt-in response shaping for the batch tools. Given a list of result objects
// and a list of field names, return only those top-level fields from each item
// so agent pipelines don't pay tokens for data they didn't ask for. Keys that
// would pollute the prototype are never copied, and non-object items collapse
// to {} so the output array stays uniform.
const PROTECTED_PROPS = new Set(['__proto__', 'constructor', 'prototype']);

/**
 * Pick a set of top-level fields out of every item in a result list.
 *
 * @param {Array<*>|*} results - Items to shape. Anything that is not an
 *     array is returned untouched.
 * @param {string[]|null|undefined} fields - Names of the fields to keep.
 *     A missing or empty list disables filtering and returns `results`
 *     itself (same reference).
 * @returns {Array<Object>|*} A new array with one plain object per input
 *     item, holding only the requested own properties in the order they
 *     were requested. Items that are null or not objects become `{}`.
 */
export function filter_fields(results, fields){
    // No field list means the caller did not opt in: hand back the input.
    if (!fields || fields.length===0)
        return results;
    if (!Array.isArray(results))
        return results;
    // Drop protected names once, up front, instead of once per item.
    const safe_fields = fields.filter(f=>!PROTECTED_PROPS.has(f));
    return results.map(item=>{
        if (item===null || typeof item!=='object')
            return {};
        // Only own properties are copied; a requested field that the item
        // does not carry is omitted rather than emitted as undefined.
        return Object.fromEntries(safe_fields
            .filter(f=>Object.prototype.hasOwnProperty.call(item, f))
            .map(f=>[f, item[f]]));
    });
}

server.js

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {parse_google_search_response} from './search_utils.js';
1010
import {createRequire} from 'node:module';
1111
import {remark} from 'remark';
1212
import strip from 'strip-markdown';
13+
import {filter_fields} from './field_filter.js';
1314
const require = createRequire(import.meta.url);
1415
const package_json = require('./package.json');
1516
const api_token = process.env.API_TOKEN;
@@ -300,8 +301,13 @@ addTool({
300301
.describe('2-letter country code for geo-targeted results '
301302
+'(e.g., "us", "uk")'),
302303
})).min(1).max(5),
304+
fields: z.array(z.enum(['link', 'title', 'description',
305+
'relevance_score', 'cursor']))
306+
.optional()
307+
.describe('Filter response to only these fields. '
308+
+'Saves tokens in agent pipelines.'),
303309
}),
304-
execute: tool_fn('search_engine_batch', async({queries}, ctx)=>{
310+
execute: tool_fn('search_engine_batch', async({queries, fields}, ctx)=>{
305311
const search_promises = queries.map(({query, engine, cursor,
306312
geo_location})=>{
307313
const normalized_engine = engine || 'google';
@@ -349,7 +355,20 @@ addTool({
349355
});
350356

351357
const results = await Promise.all(search_promises);
352-
return JSON.stringify(results, null, 2);
358+
if (!fields)
359+
return JSON.stringify(results, null, 2);
360+
const filtered = results.map(item=>{
361+
if (item && item.result && Array.isArray(item.result.organic))
362+
return {
363+
...item,
364+
result: {
365+
...item.result,
366+
organic: filter_fields(item.result.organic, fields),
367+
},
368+
};
369+
return item;
370+
});
371+
return JSON.stringify(filtered, null, 2);
353372
}),
354373
});
355374

@@ -365,29 +384,38 @@ addTool({
365384
openWorldHint: true,
366385
},
367386
parameters: z.object({
368-
urls: z.array(z.string().url()).min(1).max(5).describe('Array of URLs to scrape (max 5)')
387+
urls: z.array(z.string().url()).min(1).max(5)
388+
.describe('Array of URLs to scrape (max 5)'),
389+
fields: z.array(z.string())
390+
.optional()
391+
.describe('Optional: return only these fields from each result '
392+
+'(e.g. ["content"]).'),
369393
}),
370-
execute: tool_fn('scrape_batch', async ({urls}, ctx)=>{
371-
const scrapePromises = urls.map(url =>
372-
base_request({
373-
url: 'https://api.brightdata.com/request',
374-
method: 'POST',
375-
data: {
376-
url,
377-
zone: unlocker_zone,
378-
format: 'raw',
379-
data_format: 'markdown',
380-
},
381-
headers: api_headers(ctx.clientName, 'scrape_batch'),
382-
responseType: 'text',
383-
}).then(async response=>({
384-
url,
385-
content: (await remark()
394+
execute: tool_fn('scrape_batch', async ({urls, fields}, ctx)=>{
395+
const scrapePromises = urls.map(async url=>{
396+
try {
397+
const response = await base_request({
398+
url: 'https://api.brightdata.com/request',
399+
method: 'POST',
400+
data: {
401+
url,
402+
zone: unlocker_zone,
403+
format: 'raw',
404+
data_format: 'markdown',
405+
},
406+
headers: api_headers(ctx.clientName, 'scrape_batch'),
407+
responseType: 'text',
408+
});
409+
const content = (await remark()
386410
.use(strip, {keep: ['link', 'linkReference', 'code',
387411
'inlineCode']})
388-
.process(response.data)).value,
389-
}))
390-
);
412+
.process(response.data)).value;
413+
const result = {url, content};
414+
return fields ? filter_fields([result], fields)[0] : result;
415+
} catch(e){
416+
return {url, error: e instanceof Error ? e.message : String(e)};
417+
}
418+
});
391419

392420
const results = await Promise.allSettled(scrapePromises);
393421
return JSON.stringify(results, null, 2);

test/field-filter.test.js

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
'use strict'; /*jslint node:true es9:true*/
// Unit tests for filter_fields, run with the built-in node:test runner.
import test from 'node:test';
import assert from 'node:assert/strict';
import {filter_fields} from '../field_filter.js';

// Empty and null inputs
test('empty array returns empty array', ()=>{
    assert.deepEqual(filter_fields([], ['title', 'url']), []);
});

test('null results returns null', ()=>{
    assert.equal(filter_fields(null, ['title']), null);
});

test('undefined results returns undefined', ()=>{
    assert.equal(filter_fields(undefined, ['title']), undefined);
});

// Empty field list returns the items untouched
test('empty fields array returns original items', ()=>{
    const items = [{a: 1, b: 2}, {c: 3}];
    assert.deepEqual(filter_fields(items, []), items);
});

test('null fields array returns original items', ()=>{
    const items = [{a: 1}];
    assert.deepEqual(filter_fields(items, null), items);
});

// Null/undefined items collapse to {}
test('null item in array returns empty object', ()=>{
    assert.deepEqual(
        filter_fields([{title: 'a'}, null, {title: 'b'}], ['title']),
        [{title: 'a'}, {}, {title: 'b'}]);
});

test('undefined item in array returns empty object', ()=>{
    assert.deepEqual(
        filter_fields([{title: 'a'}, undefined, {title: 'b'}], ['title']),
        [{title: 'a'}, {}, {title: 'b'}]);
});

test('null item with non-empty fields returns empty object', ()=>{
    assert.deepEqual(filter_fields([null], ['title']), [{}]);
});

// Field selection
test('select single field', ()=>{
    assert.deepEqual(
        filter_fields([{title: 'Hello', url: 'http://x.com', desc: 'Desc'}],
            ['title']),
        [{title: 'Hello'}]);
});

test('select multiple fields', ()=>{
    assert.deepEqual(
        filter_fields([{title: 'Hello', url: 'http://x.com', desc: 'Desc'}],
            ['title', 'url']),
        [{title: 'Hello', url: 'http://x.com'}]);
});

test('select fields that do not exist returns empty object', ()=>{
    assert.deepEqual(filter_fields([{title: 'Hello'}], ['url', 'desc']),
        [{}]);
});

test('select fields from multiple items', ()=>{
    const items = [
        {title: 'A', url: 'http://a.com'},
        {title: 'B', url: 'http://b.com'},
    ];
    assert.deepEqual(filter_fields(items, ['title']),
        [{title: 'A'}, {title: 'B'}]);
});

// Field ordering follows the requested order
test('fields are returned in specified order', ()=>{
    const [picked] = filter_fields([{z: 1, a: 2, m: 3}], ['a', 'm', 'z']);
    // deepEqual does not compare key order, so the key sequence is checked
    // explicitly; otherwise this test would pass for any ordering.
    assert.deepEqual(Object.keys(picked), ['a', 'm', 'z']);
    assert.deepEqual(picked, {a: 2, m: 3, z: 1});
});

// Duplicate field names are deduplicated by the output object
test('duplicate fields in list are deduplicated', ()=>{
    assert.deepEqual(filter_fields([{title: 'Hello'}], ['title', 'title']),
        [{title: 'Hello'}]);
});

// Non-object items
test('non-object item in array returns empty object', ()=>{
    assert.deepEqual(filter_fields([42, 'string', true], ['a']),
        [{}, {}, {}]);
});

test('mixed object and non-object items', ()=>{
    assert.deepEqual(
        filter_fields([{title: 'A'}, 42, {title: 'B'}], ['title']),
        [{title: 'A'}, {}, {title: 'B'}]);
});

// Large field list
test('large field list is handled', ()=>{
    const fields = Array.from({length: 1000}, (_, i)=>`field${i}`);
    const r = filter_fields([{field0: 0, field500: 500, field999: 999}],
        fields);
    assert.deepEqual(r, [{field0: 0, field500: 500, field999: 999}]);
});

// Special characters and numeric-looking field names
test('fields with special chars', ()=>{
    assert.deepEqual(
        filter_fields([{'field-name': 1, 'field_name': 2, 'field.name': 3}],
            ['field-name', 'field_name']),
        [{'field-name': 1, 'field_name': 2}]);
});

test('numeric-looking field names', ()=>{
    assert.deepEqual(
        filter_fields([{'123': 'num', '0': 'zero'}], ['123', '0']),
        [{'123': 'num', '0': 'zero'}]);
});

// Nested objects are kept as values (only top-level keys are selected)
test('nested objects are preserved as values', ()=>{
    assert.deepEqual(
        filter_fields([{title: 'A', meta: {k: 'v'}}], ['title', 'meta']),
        [{title: 'A', meta: {k: 'v'}}]);
});

// Prototype-pollution guard: protected keys are never copied even if requested
test('protected prototype keys are never copied', ()=>{
    // Requesting protected names from an ordinary item leaves only the
    // legitimate field.
    assert.deepEqual(
        filter_fields([{a: 1}], ['__proto__', 'constructor', 'prototype', 'a']),
        [{a: 1}]);
    // JSON.parse stores "__proto__" as an own data property, so each
    // protected key below passes the own-property check; only the
    // protected-name guard can keep them out of the result.
    const hostile = JSON.parse('{"__proto__": {"polluted": true}, '
        +'"constructor": "c", "prototype": "p", "a": 1}');
    const [picked] = filter_fields([hostile],
        ['__proto__', 'constructor', 'prototype', 'a']);
    assert.deepEqual(Object.keys(picked), ['a']);
    assert.deepEqual(picked, {a: 1});
    assert.equal(Object.getPrototypeOf(picked), Object.prototype);
});

0 commit comments

Comments
 (0)