@@ -10,6 +10,7 @@ import {parse_google_search_response} from './search_utils.js';
1010import { createRequire } from 'node:module' ;
1111import { remark } from 'remark' ;
1212import strip from 'strip-markdown' ;
13+ import { ContextCache , filterFields , buildBatchMetrics } from './context_cache.js' ;
1314const require = createRequire ( import . meta. url ) ;
1415const package_json = require ( './package.json' ) ;
1516const api_token = process . env . API_TOKEN ;
@@ -299,9 +300,12 @@ addTool({
299300 . optional ( )
300301 . describe ( '2-letter country code for geo-targeted results '
301302 + '(e.g., "us", "uk")' ) ,
303+ fields : z . array ( z . enum ( [ 'link' , 'title' , 'description' , 'relevance_score' , 'cursor' ] ) )
304+ . optional ( )
305+ . describe ( 'Filter response to only these fields. Saves tokens in agent pipelines.' ) ,
302306 } ) ) . min ( 1 ) . max ( 5 ) ,
303307 } ) ,
304- execute : tool_fn ( 'search_engine_batch' , async ( { queries} , ctx ) => {
308+ execute : tool_fn ( 'search_engine_batch' , async ( { queries, fields } , ctx ) => {
305309 const search_promises = queries . map ( ( { query, engine, cursor,
306310 geo_location} ) => {
307311 const normalized_engine = engine || 'google' ;
@@ -349,49 +353,126 @@ addTool({
349353 } ) ;
350354
351355 const results = await Promise . all ( search_promises ) ;
352- return JSON . stringify ( results , null , 2 ) ;
356+
357+ // Apply field filtering if requested
358+ // For Google: filter within result.organic array
359+ // For Bing/Yandex: result is just text, no fields to filter
360+ let all_results = results ;
361+ if ( fields && Array . isArray ( all_results ) ) {
362+ all_results = all_results . map ( page_result => {
363+ if ( page_result . result && typeof page_result . result === 'object' && Array . isArray ( page_result . result . organic ) ) {
364+ return {
365+ ...page_result ,
366+ result : {
367+ ...page_result . result ,
368+ organic : filterFields ( page_result . result . organic , fields ) ,
369+ } ,
370+ } ;
371+ }
372+ return page_result ;
373+ } ) ;
374+ }
375+
376+ return JSON . stringify ( all_results , null , 2 ) ;
353377 } ) ,
354378} ) ;
355379
/**
 * Fetch a single URL for the `scrape_batch` tool.
 *
 * Never rejects on a failed request: failures are converted into an
 * `{status: 'error'}` entry so one bad URL cannot sink the whole batch.
 *
 * @param {string} url - page to scrape
 * @param {'markdown'|'raw'} format - 'markdown' runs the body through
 *   remark + strip-markdown; 'raw' returns the API body untouched (the API
 *   is always asked for `data_format: 'markdown'`, so "raw" here means
 *   un-stripped Markdown)
 * @param {object} [ctx] - tool context; only `ctx.clientName` is read
 * @returns {Promise<object>} `{url, status, latency_ms, content}` on success,
 *   `{url, status, latency_ms, error}` on failure
 */
async function scrape_batch_fetch(url, format, ctx) {
    const started_at = Date.now();
    try {
        const response = await base_request({
            url: 'https://api.brightdata.com/request',
            method: 'POST',
            headers: api_headers(ctx?.clientName, 'scrape_batch'),
            data: {
                zone: unlocker_zone,
                url,
                format: 'raw',
                data_format: 'markdown',
            },
            // Restored from the previous version of this tool: keeps the body
            // a string so remark and content hashing never receive a parsed
            // object. NOTE(review): assumes base_request forwards this option
            // to the underlying HTTP client - confirm.
            responseType: 'text',
        });

        let content = response.data;
        if (format === 'markdown') {
            content = (await remark()
                .use(strip, {
                    keep: ['link', 'linkReference', 'code', 'inlineCode'],
                })
                .process(content)).value;
        }
        return {
            url,
            status: 'success',
            latency_ms: Date.now() - started_at,
            content,
        };
    } catch (e) {
        return {
            url,
            status: 'error',
            latency_ms: Date.now() - started_at,
            error: `Scrape failed: ${e?.response?.status ?? e?.message ?? String(e)}`,
        };
    }
}

/**
 * Apply deduplication and field filtering to fetched entries, in input order.
 *
 * Running this after all fetches have settled (instead of inside each
 * in-flight request) makes the "which URL is the duplicate" decision depend
 * on the order of `urls`, not on which response happened to arrive first.
 *
 * @param {object[]} fetched - entries produced by scrape_batch_fetch
 * @param {object|null} cache - ContextCache instance, or null when
 *   deduplication is disabled
 * @param {string[]} [fields] - top-level fields to keep on successful,
 *   non-duplicate results; empty/undefined keeps everything
 * @returns {object[]} final result entries
 */
function scrape_batch_finalize(fetched, cache, fields) {
    return fetched.map(entry => {
        // Error entries pass through untouched so failures stay visible.
        if (entry.status !== 'success') {
            return entry;
        }
        const {content, ...meta} = entry;
        const dedup = cache?.check(content, entry.url);
        if (dedup?.isDuplicate) {
            // Duplicates are not field-filtered: the caller needs
            // `skipped`/`duplicate_of` to understand why content is null.
            return {
                ...meta,
                content: null,
                skipped: true,
                duplicate_of: dedup.duplicateOf,
                content_hash: dedup.contentHash,
            };
        }
        const result = {
            ...meta,
            content,
            content_hash: dedup?.contentHash ?? null,
        };
        // Filter the result object itself, as the `fields` description
        // promises. NOTE(review): assumes filterFields(items, fields) returns
        // one filtered object per input item, as its other call sites in this
        // file suggest - confirm against context_cache.js.
        return fields?.length ? filterFields([result], fields)[0] : result;
    });
}

addTool({
    name: 'scrape_batch',
    description: 'Scrape multiple webpages URLs with advanced options for '
        + 'content extraction and get back the results in MarkDown language. '
        + 'This tool can unlock any webpage even if it uses bot detection or '
        + 'CAPTCHA.',
    annotations: {
        title: 'Batch Scrape',
        readOnlyHint: true,
        openWorldHint: true,
    },
    parameters: z.object({
        urls: z.array(z.string().url()).min(1).max(5)
            .describe('List of URLs to scrape (max 5)'),
        deduplicate: z.boolean().optional().default(true)
            .describe('Remove duplicate content blocks across URLs. '
                + 'Default: true.'),
        fields: z.array(z.string()).optional()
            .describe('Optional: return only these top-level fields from each result'),
        format: z.enum(['markdown', 'raw']).optional().default('markdown')
            .describe('Output format'),
        include_metrics: z.boolean().optional().default(false)
            .describe('Include deduplication metrics in response. Default: false (returns flat array).'),
    }),
    /**
     * Scrape every URL in parallel, then deduplicate and filter the results.
     *
     * Returns a JSON string: a flat array of result entries, or
     * `{results, metrics}` when `include_metrics` is set.
     */
    execute: tool_fn('scrape_batch', async (data, ctx) => {
        // NOTE(review): if tool_fn already enforces the rate limit, this call
        // counts every invocation twice - confirm and drop if so.
        check_rate_limit();
        const cache = data.deduplicate ? new ContextCache() : null;
        const t0 = Date.now();

        const settled = await Promise.allSettled(
            data.urls.map(url => scrape_batch_fetch(url, data.format, ctx))
        );
        // scrape_batch_fetch catches its own errors, so a rejection here is
        // unexpected; still report it against the URL it belongs to.
        const fetched = settled.map((outcome, index) =>
            outcome.status === 'fulfilled'
                ? outcome.value
                : {
                    url: data.urls[index],
                    status: 'error',
                    error: 'Request failed: ' + String(
                        outcome.reason?.message ?? outcome.reason ?? 'Unknown error'
                    ),
                }
        );

        const output = scrape_batch_finalize(fetched, cache, data.fields);

        if (data.include_metrics) {
            return JSON.stringify({
                results: output,
                metrics: cache
                    ? buildBatchMetrics(cache, {total_ms: Date.now() - t0})
                    : null,
            }, null, 2);
        }
        return JSON.stringify(output, null, 2);
    }),
});
396477
397478addTool ( {
0 commit comments