@@ -1094,7 +1094,7 @@ func (d *Client) fetchDateTimeStats(ctx context.Context, tableName, columnName s
10941094
10951095func (d * Client ) fetchJSONStats (ctx context.Context , tableName , columnName string ) (* diff.JSONStatistics , error ) {
10961096 statsQuery := fmt .Sprintf (`
1097- SELECT
1097+ SELECT
10981098 COUNT(*) as count_val,
10991099 COUNTIF(%s IS NULL) as null_count
11001100 FROM %s` ,
@@ -1122,6 +1122,145 @@ func (d *Client) fetchJSONStats(ctx context.Context, tableName, columnName strin
11221122 return stats , nil
11231123}
11241124
1125+ // Estimates the cost of running a table diff operation without executing the queries.
1126+ func (d * Client ) EstimateTableDiffCost (ctx context.Context , tableName string , schemaOnly bool ) (* diff.TableDiffCostEstimate , error ) {
1127+ result := & diff.TableDiffCostEstimate {
1128+ TableName : tableName ,
1129+ Queries : make ([]* diff.QueryCostEstimate , 0 ),
1130+ Supported : true ,
1131+ }
1132+
1133+ // Parse table name to get dataset reference
1134+ tableComponents := strings .Split (tableName , "." )
1135+ var datasetRef string
1136+ var targetTable string
1137+
1138+ switch len (tableComponents ) {
1139+ case 2 :
1140+ datasetRef = fmt .Sprintf ("%s.%s" , d .config .ProjectID , tableComponents [0 ])
1141+ targetTable = tableComponents [1 ]
1142+ case 3 :
1143+ datasetRef = fmt .Sprintf ("%s.%s" , tableComponents [0 ], tableComponents [1 ])
1144+ targetTable = tableComponents [2 ]
1145+ default :
1146+ return nil , fmt .Errorf ("table name must be in dataset.table or project.dataset.table format, '%s' given" , tableName )
1147+ }
1148+
1149+ schemaQuery := buildSchemaQuery (datasetRef , targetTable )
1150+ result .Queries = append (result .Queries , & diff.QueryCostEstimate {
1151+ QueryType : "schema" ,
1152+ Query : truncateQuery (schemaQuery ),
1153+ BytesProcessed : 0 , // INFORMATION_SCHEMA queries are free
1154+ BytesBilled : 0 ,
1155+ })
1156+
1157+ if schemaOnly {
1158+ // In schema-only mode, we only run the schema query
1159+ return result , nil
1160+ }
1161+
1162+ // 2. Row count query - dry run to estimate cost
1163+ countQuery := fmt .Sprintf ("SELECT COUNT(*) as row_count FROM `%s`" , tableName )
1164+ countEstimate , err := d .estimateQueryCost (ctx , countQuery , "rowCount" )
1165+ if err != nil {
1166+ return nil , fmt .Errorf ("failed to estimate row count query cost: %w" , err )
1167+ }
1168+ result .Queries = append (result .Queries , countEstimate )
1169+
1170+ // 3. Get schema to determine column types (this is free since we're querying INFORMATION_SCHEMA)
1171+ schemaResult , err := d .Select (ctx , & query.Query {Query : schemaQuery })
1172+ if err != nil {
1173+ return nil , fmt .Errorf ("failed to get schema for cost estimation: %w" , err )
1174+ }
1175+
1176+ // 4. For each column, estimate the statistics query cost
1177+ for _ , row := range schemaResult {
1178+ if len (row ) < 2 {
1179+ continue
1180+ }
1181+
1182+ columnName , ok := row [0 ].(string )
1183+ if ! ok {
1184+ continue
1185+ }
1186+
1187+ dataType , ok := row [1 ].(string )
1188+ if ! ok {
1189+ continue
1190+ }
1191+
1192+ normalizedType := d .typeMapper .MapType (dataType )
1193+ var statsQuery string
1194+
1195+ switch normalizedType {
1196+ case diff .CommonTypeNumeric :
1197+ statsQuery = fmt .Sprintf (`SELECT MIN(%s), MAX(%s), AVG(%s), SUM(%s), COUNT(%s), COUNTIF(%s IS NULL), STDDEV(%s) FROM %s` ,
1198+ columnName , columnName , columnName , columnName , columnName , columnName , columnName , "`" + tableName + "`" )
1199+ case diff .CommonTypeString :
1200+ statsQuery = fmt .Sprintf (`SELECT MIN(LENGTH(%s)), MAX(LENGTH(%s)), AVG(LENGTH(%s)), COUNT(DISTINCT %s), COUNT(*), COUNTIF(%s IS NULL), COUNTIF(%s = '') FROM %s` ,
1201+ columnName , columnName , columnName , columnName , columnName , columnName , "`" + tableName + "`" )
1202+ case diff .CommonTypeBoolean :
1203+ statsQuery = fmt .Sprintf (`SELECT COUNTIF(%s = true), COUNTIF(%s = false), COUNT(*), COUNTIF(%s IS NULL) FROM %s` ,
1204+ columnName , columnName , columnName , "`" + tableName + "`" )
1205+ case diff .CommonTypeDateTime :
1206+ statsQuery = fmt .Sprintf (`SELECT MIN(%s), MAX(%s), COUNT(DISTINCT %s), COUNT(*), COUNTIF(%s IS NULL) FROM %s` ,
1207+ columnName , columnName , columnName , columnName , "`" + tableName + "`" )
1208+ case diff .CommonTypeJSON :
1209+ statsQuery = fmt .Sprintf (`SELECT COUNT(*), COUNTIF(%s IS NULL) FROM %s` ,
1210+ columnName , "`" + tableName + "`" )
1211+ case diff .CommonTypeBinary , diff .CommonTypeUnknown :
1212+ // Skip binary and unknown types
1213+ continue
1214+ }
1215+
1216+ estimate , err := d .estimateQueryCost (ctx , statsQuery , "statistics:" + columnName )
1217+ if err != nil {
1218+ return nil , fmt .Errorf ("failed to estimate statistics query cost for column '%s': %w" , columnName , err )
1219+ }
1220+ result .Queries = append (result .Queries , estimate )
1221+ }
1222+
1223+ // Calculate totals
1224+ for _ , q := range result .Queries {
1225+ result .TotalBytesProcessed += q .BytesProcessed
1226+ result .TotalBytesBilled += q .BytesBilled
1227+ }
1228+
1229+ return result , nil
1230+ }
1231+
1232+ // estimateQueryCost runs a dry-run for a query and returns the bytes estimate.
1233+ func (d * Client ) estimateQueryCost (ctx context.Context , queryStr string , queryType string ) (* diff.QueryCostEstimate , error ) {
1234+ stats , err := d .QueryDryRun (ctx , & query.Query {Query : queryStr })
1235+ if err != nil {
1236+ return nil , err
1237+ }
1238+
1239+ bytesProcessed := stats .TotalBytesProcessed
1240+ // BigQuery has a minimum billing of 10 MB per query
1241+ bytesBilled := bytesProcessed
1242+ if bytesBilled < 10 * 1024 * 1024 && bytesBilled > 0 {
1243+ bytesBilled = 10 * 1024 * 1024
1244+ }
1245+
1246+ return & diff.QueryCostEstimate {
1247+ QueryType : queryType ,
1248+ Query : truncateQuery (queryStr ),
1249+ BytesProcessed : bytesProcessed ,
1250+ BytesBilled : bytesBilled ,
1251+ }, nil
1252+ }
1253+
// truncateQuery returns a single-line version of q for display purposes.
//
// All runs of whitespace (including newlines) are collapsed to one space. If
// the result is longer than 100 bytes it is cut to at most 97 bytes and "..."
// is appended. The cut always falls on a rune boundary, so a multi-byte UTF-8
// character is never split in half.
func truncateQuery(q string) string {
	const (
		maxLen   = 100
		ellipsis = "..."
	)

	// Remove extra whitespace and newlines.
	q = strings.Join(strings.Fields(q), " ")
	if len(q) <= maxLen {
		return q
	}

	// Ranging over a string yields the byte index of each rune start, so cut
	// ends up as the last rune boundary at or before the byte budget.
	cut := 0
	for i := range q {
		if i > maxLen-len(ellipsis) {
			break
		}
		cut = i
	}
	return q[:cut] + ellipsis
}
1263+
11251264// tableMetadataResult holds the result of fetching table metadata.
11261265type tableMetadataResult struct {
11271266 Columns []* ansisql.DBColumn
0 commit comments