Skip to content

Commit a62650a

Browse files
authored
Merge pull request #142 from brightdata/feat/search-dataset-tool
Add search_dataset and list_dataset_fields tools
2 parents d691e28 + 3dd8797 commit a62650a

5 files changed

Lines changed: 265 additions & 0 deletions

File tree

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
"files": [
4040
"server.js",
4141
"search_utils.js",
42+
"search_dataset_schema.js",
4243
"browser_tools.js",
4344
"browser_session.js",
4445
"aria_snapshot_filter.js",

search_dataset_schema.js

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
'use strict'; /*jslint node:true es9:true*/
2+
import {z} from 'zod';
3+
4+
// Dataset ids that the dataset tools accept as dataset_id.
export const DATASET_IDS = [
    'gd_l1viktl72bvl7bjuj0', 'gd_me5ppxjr2ge6icjuh0',
    'gd_l1vikfnt1wgvvqz95w',
];

// Validates a dataset_id argument against DATASET_IDS.
export const dataset_id_schema = z.enum(DATASET_IDS);

// Operators a leaf filter may use. The order matters: it is the order in
// which they are joined into the tool descriptions.
export const FILTER_OPERATORS = [
    '=', '!=', '<', '<=', '>', '>=',
    'in', 'not_in',
    'includes', 'not_includes',
    'array_includes', 'not_array_includes',
    'is_null', 'is_not_null',
];

// A leaf value is a string, number or boolean, or an array of those.
const scalar_value_schema = z.union([z.string(), z.number(), z.boolean()]);
const leaf_value_schema = z.union([
    z.string(),
    z.number(),
    z.boolean(),
    z.array(scalar_value_schema),
]);
22+
23+
// Operators that only test for presence and therefore carry no value.
// NOTE(review): assumes the search API accepts is_null/is_not_null leaves
// without a value - confirm against the dataset filter API docs.
const VALUELESS_OPERATORS = ['is_null', 'is_not_null'];

// A single field condition: {name, operator, value}.
// - operator is validated against FILTER_OPERATORS so a typo is rejected
//   locally with a clear message instead of being sent to the API.
// - value may be omitted only for the operators in VALUELESS_OPERATORS;
//   every other operator still requires it (enforced by the refinement).
const leaf_schema = z.object({
    name: z.string().describe('Field name to filter on. Get valid field '
        +'names from the list_dataset_fields tool.'),
    operator: z.enum(FILTER_OPERATORS).describe('Filter operator, one of: '
        +FILTER_OPERATORS.join(', ')),
    value: leaf_value_schema.optional().describe('Value to compare the '
        +'field against. May be omitted only for: '
        +VALUELESS_OPERATORS.join(', ')),
}).refine(leaf=>leaf.value!==undefined
    || VALUELESS_OPERATORS.includes(leaf.operator),
    {message: 'value is required for this operator', path: ['value']});
30+
31+
/**
 * Build the schema for one node of a filter tree.
 *
 * A node is either a leaf condition or an and/or group whose `filters`
 * array (at least one entry) holds nodes built with `depth-1`. Once
 * `depth` is 1 or less only leaves are accepted, which is what bounds the
 * nesting.
 *
 * @param {number} depth Remaining levels, counting the leaf level.
 * @returns The leaf schema, or a union of a group schema and the leaf
 *     schema.
 */
function build_node_schema(depth){
    return depth<=1 ? leaf_schema : z.union([
        z.object({
            operator: z.enum(['and', 'or']),
            filters: z.array(build_node_schema(depth-1)).min(1),
        }),
        leaf_schema,
    ]);
}

// Maximum number of and/or groups nested inside one another.
const MAX_NESTING = 3;
// +1 because the leaves sit one level below the innermost group.
export const filter_schema = build_node_schema(MAX_NESTING+1);
43+
44+
// True for non-null, non-array objects. Arrays are excluded on purpose:
// typeof reports them as 'object', but they are not field maps or field
// descriptors.
function is_record(value){
    return !!value && typeof value=='object' && !Array.isArray(value);
}

/**
 * Flatten a dataset metadata object into a list of filterable fields.
 *
 * Reads `metadata.fields`, a map of field name to descriptor, and returns
 * one `{name, type, description}` entry per field, in the map's own key
 * order. Entries are skipped when the descriptor is not a plain object
 * (null, primitive or array) or when its `active` flag is exactly `false`;
 * a missing `active` counts as active.
 *
 * @param {*} metadata Metadata object; anything without a plain-object
 *     `fields` property yields an empty list.
 * @returns {Array<{name: string, type: *, description: *}>}
 */
export function metadata_to_fields(metadata){
    const fields = is_record(metadata) && is_record(metadata.fields)
        ? metadata.fields : {};
    return Object.entries(fields)
        .filter(([, meta])=>is_record(meta) && meta.active!==false)
        .map(([name, meta])=>({
            name,
            type: meta.type,
            description: meta.description,
        }));
}

server.js

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import {tools as browser_tools} from './browser_tools.js';
77
import prompts from './prompts.js';
88
import {GROUPS} from './tool_groups.js';
99
import {parse_google_search_response} from './search_utils.js';
10+
import {dataset_id_schema, filter_schema, metadata_to_fields, FILTER_OPERATORS}
11+
from './search_dataset_schema.js';
1012
import {createRequire} from 'node:module';
1113
import {remark} from 'remark';
1214
import strip from 'strip-markdown';
@@ -603,6 +605,93 @@ addTool({
603605
}),
604606
});
605607

608+
// Human-readable list of the searchable datasets, appended to the
// descriptions of the dataset tools.
const SEARCHABLE_DATASETS_DESC = 'Supported dataset_id values:\n'
    +'- gd_l1viktl72bvl7bjuj0: LinkedIn people profiles\n'
    +'- gd_me5ppxjr2ge6icjuh0: LinkedIn people profiles (contact-enriched)\n'
    +'- gd_l1vikfnt1wgvvqz95w: LinkedIn company information';

// list_dataset_fields: fetches the dataset's metadata and returns its
// active fields as a JSON array of {name, type, description}.
addTool({
    name: 'list_dataset_fields',
    description: 'List the filterable fields of a searchable dataset (field '
        +'name, type, and description). Call this before search_dataset to '
        +'learn which field names and types you can filter on.\n'
        +SEARCHABLE_DATASETS_DESC,
    annotations: {
        title: 'List Dataset Fields',
        readOnlyHint: true,
        openWorldHint: true,
    },
    parameters: z.object({dataset_id: dataset_id_schema}),
    execute: tool_fn('list_dataset_fields', async({dataset_id}, ctx)=>{
        const {data} = await base_request({
            url: `https://api.brightdata.com/datasets/${dataset_id}/metadata`,
            method: 'GET',
            headers: api_headers(ctx.clientName, 'list_dataset_fields'),
        });
        return JSON.stringify(metadata_to_fields(data));
    }),
});
637+
638+
// search_dataset: posts a synchronous filter search for one dataset and
// returns {hits, total_hits, took} as JSON, plus search_after when the
// response carries a pagination cursor.
addTool({
    name: 'search_dataset',
    description: [
        'Search a Bright Data dataset by a filter and get matching records '
            +'back directly (fast Elasticsearch-backed search; no snapshot). '
            +'Use this to FIND MANY records by criteria, as opposed to the '
            +'web_data_* tools which fetch ONE record by URL.',
        'First call list_dataset_fields to get valid field names.',
        'A filter is a tree: a group {operator:"and"|"or", filters:[...]} '
            +'or a leaf {name, value, operator}. Max nesting depth 3.',
        'Leaf operators: '+FILTER_OPERATORS.join(', ')+'.',
        SEARCHABLE_DATASETS_DESC,
    ].join('\n'),
    annotations: {
        title: 'Search Dataset',
        readOnlyHint: true,
        openWorldHint: true,
    },
    parameters: z.object({
        dataset_id: dataset_id_schema,
        filter: filter_schema.describe('Filter tree describing which records '
            +'to match. Required, cannot be empty.'),
        size: z.number().int().positive().max(10).optional().default(10)
            .describe('Number of records to return (max 10, default 10)'),
        sort: z.union([
            z.enum(['default', 'random']),
            z.array(z.record(z.enum(['asc', 'desc']))),
        ]).optional().describe('Sorting: "default", "random", or a custom '
            +'array like [{"timestamp":"asc"}]. Use "default" or custom sort '
            +'to paginate with search_after.'),
        search_after: z.array(z.any()).optional().describe('Pagination cursor '
            +'from a previous response\'s search_after value.'),
    }),
    execute: tool_fn('search_dataset', async(args, ctx)=>{
        const {dataset_id, filter, size, sort, search_after} = args;
        // Optional keys are only sent when the caller supplied them.
        const body = {
            mode: 'sync',
            filter,
            size,
            ...(sort!==undefined ? {sort} : {}),
            ...(search_after!==undefined ? {search_after} : {}),
        };
        const {data} = await base_request({
            url: `https://api.brightdata.com/datasets/search/${dataset_id}`,
            method: 'POST',
            data: body,
            headers: {
                ...api_headers(ctx.clientName, 'search_dataset'),
                'Content-Type': 'application/json',
            },
        });
        const payload = data || {};
        const result = {
            hits: payload.hits,
            total_hits: payload.total_hits,
            took: payload.took,
        };
        // Pass the cursor through only when the response contains one.
        if (payload.search_after!==undefined)
            result.search_after = payload.search_after;
        return JSON.stringify(result);
    }),
});
694+
606695
addTool({
607696
name: 'session_stats',
608697
description: 'Tell the user about the tool usage during this session',

test/search-dataset-schema.test.js

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
'use strict'; /*jslint node:true es9:true*/
2+
import test from 'node:test';
3+
import assert from 'node:assert/strict';
4+
import {DATASET_IDS, dataset_id_schema, metadata_to_fields, FILTER_OPERATORS,
5+
filter_schema} from '../search_dataset_schema.js';
6+
7+
// Pins the exported dataset id list, its zod enum, and the operator list.
test('DATASET_IDS lists the three supported datasets', ()=>{
    const expected = [
        'gd_l1viktl72bvl7bjuj0',
        'gd_me5ppxjr2ge6icjuh0',
        'gd_l1vikfnt1wgvvqz95w',
    ];
    assert.deepEqual(DATASET_IDS, expected);
});

test('dataset_id_schema accepts a supported id', ()=>{
    const id = 'gd_l1viktl72bvl7bjuj0';
    assert.equal(dataset_id_schema.parse(id), id);
});

test('dataset_id_schema rejects an unknown id', ()=>{
    const parse_unknown = ()=>dataset_id_schema.parse('gd_not_a_real_dataset');
    assert.throws(parse_unknown);
});

test('FILTER_OPERATORS lists the documented operators', ()=>{
    const comparison = ['=', '!=', '<', '<=', '>', '>='];
    const membership = ['in', 'not_in'];
    const substring = ['includes', 'not_includes'];
    const array_membership = ['array_includes', 'not_array_includes'];
    const null_checks = ['is_null', 'is_not_null'];
    assert.deepEqual(FILTER_OPERATORS, [
        ...comparison,
        ...membership,
        ...substring,
        ...array_membership,
        ...null_checks,
    ]);
});
33+
34+
// metadata_to_fields: inactive fields are dropped, malformed input is
// tolerated, and only name/type/description survive.
test('metadata_to_fields keeps active fields as name/type/description', ()=>{
    const fields = {
        name: {type: 'text', active: true,
            description: 'The name of the company'},
        url: {type: 'url', required: true,
            description: 'The company URL'},
        cb_rank: {type: 'number', active: false,
            description: 'Crunchbase rank'},
    };
    const actual = metadata_to_fields({id: 'gd_l1vijqt9jfj7olije', fields});
    const expected = [
        {name: 'name', type: 'text', description: 'The name of the company'},
        {name: 'url', type: 'url', description: 'The company URL'},
    ];
    assert.deepEqual(actual, expected);
});

test('metadata_to_fields tolerates missing fields object', ()=>{
    for (const metadata of [{id: 'x'}, null])
        assert.deepEqual(metadata_to_fields(metadata), []);
});

test('metadata_to_fields skips non-object field entries', ()=>{
    const metadata = {fields: {
        bad: null,
        ok: {type: 'text', description: 'fine'},
    }};
    const expected = [{name: 'ok', type: 'text', description: 'fine'}];
    assert.deepEqual(metadata_to_fields(metadata), expected);
});
62+
63+
// filter_schema: accepted shapes must round-trip unchanged through parse();
// over-deep or empty filters must throw.
const and_group = (...filters)=>({operator: 'and', filters});
const or_group = (...filters)=>({operator: 'or', filters});
const assert_round_trip = filter=>
    assert.deepEqual(filter_schema.parse(filter), filter);

test('filter_schema accepts the documented flat example', ()=>{
    assert_round_trip(and_group(
        {name: 'name', value: 'Egor', operator: 'includes'}));
});

test('filter_schema accepts a single leaf node', ()=>{
    assert_round_trip({name: 'cb_rank', value: 100, operator: '<'});
});

test('filter_schema accepts array and boolean leaf values', ()=>{
    assert_round_trip(or_group(
        {name: 'tags', value: ['a', 'b'], operator: 'array_includes'},
        {name: 'verified', value: true, operator: '='}));
});

test('filter_schema accepts nesting up to depth 3', ()=>{
    const leaf = {name: 'name', value: 'x', operator: 'includes'};
    assert_round_trip(and_group(or_group(and_group(leaf))));
});

test('filter_schema rejects nesting deeper than 3', ()=>{
    const leaf = {name: 'name', value: 'x', operator: 'includes'};
    const too_deep = and_group(or_group(and_group(or_group(leaf))));
    assert.throws(()=>filter_schema.parse(too_deep));
});

test('filter_schema rejects an empty object', ()=>{
    assert.throws(()=>filter_schema.parse({}));
});

tool_groups.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ export const GROUPS = {
3333
'web_data_linkedin_job_listings',
3434
'web_data_linkedin_posts',
3535
'web_data_linkedin_people_search',
36+
'list_dataset_fields',
37+
'search_dataset',
3638
'web_data_instagram_profiles',
3739
'web_data_instagram_posts',
3840
'web_data_instagram_reels',
@@ -95,6 +97,8 @@ export const GROUPS = {
9597
'web_data_google_maps_reviews',
9698
'web_data_zillow_properties_listing',
9799
'web_data_booking_hotel_listings',
100+
'list_dataset_fields',
101+
'search_dataset',
98102
],
99103
},
100104
RESEARCH: {

0 commit comments

Comments
 (0)