Commit d72f557a by luoqi

feat(sync): PR2 — cohort batch + checkpoint(内存稳 + 进度可观测)

资源 + 续跑 2 件:
1. **Cohort batch** — 按 patient 分批 load+transform+assemble+write,
   每批跑完中间表出作用域 → V8 GC 释放,峰值内存从 5-10GB 降到 500MB-1GB
   14GB 机器全量跑稳,不再撞 PG panic 那种磁盘 / OOM。

2. **Per-batch checkpoint** — sync_logs.metadata JSONB 记 cohortDone /
   cohortTotal / lastBatchMs / lastBatchRssMb,Dashboard + 监控可观测;
   readCheckpointOffset 从同 syncLogId 读 cohortDone(为 PR3 --resume 留口)。

变更:
  prisma migration 20260528000001:syncLog 加 metadata JSONB 列
  prisma schema 同步 metadata Json? 字段
  ClickHouseSourceService:
    - listPatientPairs:DISTINCT (patient_id, brand) FROM fact_client_out
      ORDER BY patient_id,增量 cursor 同步过滤;返回 batch 的边界
    - loadTablesForCohort:跟 loadAllTables 同形态,SQL 注入
      (patient_id, brand) IN (tuples) 过滤,增量 cursor 仍生效;
      不做反向拉主档(本批 fact_client_out 已含本批所有 patient 主档)
    - injectCohortFilter:把 IN tuple 在原 SQL 的 WHERE 末 / ORDER BY 前插入
  ColdImportService.importDirectory:
    - 加 cohortBatchSize option(env PAC_COHORT_BATCH_SIZE 兜底,默认 5000)
    - 抽出 processCohort 私有方法(单 cohort 完整 load→transform→write 流程)
    - cohortBatchSize > 0 + sql_source → 分批 loop,每批结束更新 metadata
    - 否则 single-shot(向后兼容,文件源走此路径)
    - chunk + resolveCohortBatchSize 导出工具函数(给 PR4 + 测试用)
  CLI cold-import.cli.ts:
    - 加 --incremental / --cohort-batch=N / --no-cohort 参数
    - 启动日志打印分批配置

向后兼容:
  - 既有 importDirectory({dryRun, incremental}) 调用全不动
  - 文件源(manifest.tables[])仍走 single-shot
  - ClickHouse 源默认走 5000 cohort,可 --no-cohort 退回 single-shot

PR3 后续:
  - 加 --resume 用 readCheckpointOffset(stale running lock 需手动 abort 后才能用)
  - 加 cron 看门狗自动清 stale running
parent fcc2a9d6
-- sync_logs.metadata:cohort batch / assembler 进度 + 资源画像 + 自定义标签
-- 形态(JSONB,无强 schema):
-- { "cohortTotal": 26, "cohortDone": 5, "lastAssembler": "diagnosis", "rssPeakMb": 980, ... }
-- 用途:
-- 1. 进度可观测:监控 / dashboard 看跑到第几批
-- 2. checkpoint 续跑:崩了 / 手动 abort 后,重启 CLI 加 --resume 从 cohortDone+1 接着跑
-- 3. 资源画像:rssPeakMb 记录峰值内存,辅助调 batch size
ALTER TABLE "sync_logs" ADD COLUMN "metadata" JSONB;
...@@ -1331,6 +1331,11 @@ model SyncLog { ...@@ -1331,6 +1331,11 @@ model SyncLog {
startedAt DateTime @default(now()) @map("started_at") @db.Timestamptz(3) startedAt DateTime @default(now()) @map("started_at") @db.Timestamptz(3)
endedAt DateTime? @map("ended_at") @db.Timestamptz(3) endedAt DateTime? @map("ended_at") @db.Timestamptz(3)
/// cohort batch / assembler 进度 + 资源画像(JSONB,无强 schema)
/// 形态见 packages/types schema(可选,目前用 unknown);典型字段:
/// { cohortTotal, cohortDone, lastAssembler, rssPeakMb, batchTimings: [...] }
metadata Json?
host Host @relation(fields: [hostId], references: [id]) host Host @relation(fields: [hostId], references: [id])
transactions PatientTransaction[] transactions PatientTransaction[]
......
...@@ -20,14 +20,21 @@ interface CliArgs { ...@@ -20,14 +20,21 @@ interface CliArgs {
dir?: string; dir?: string;
dryRun: boolean; dryRun: boolean;
help: boolean; help: boolean;
incremental: boolean;
cohortBatchSize?: number | null; // null = 显式禁用分批,undefined = 用 env / 默认
} }
function parseArgs(argv: string[]): CliArgs { function parseArgs(argv: string[]): CliArgs {
const args: CliArgs = { dryRun: false, help: false }; const args: CliArgs = { dryRun: false, help: false, incremental: false };
for (const a of argv) { for (const a of argv) {
if (a === '--help' || a === '-h') args.help = true; if (a === '--help' || a === '-h') args.help = true;
else if (a === '--dry-run') args.dryRun = true; else if (a === '--dry-run') args.dryRun = true;
else if (a.startsWith('--dir=')) args.dir = a.slice('--dir='.length); else if (a === '--incremental') args.incremental = true;
else if (a === '--no-cohort') args.cohortBatchSize = 0; // 显式禁用分批,跑 single-shot
else if (a.startsWith('--cohort-batch=')) {
const n = parseInt(a.slice('--cohort-batch='.length), 10);
args.cohortBatchSize = Number.isFinite(n) && n >= 0 ? n : undefined;
} else if (a.startsWith('--dir=')) args.dir = a.slice('--dir='.length);
} }
return args; return args;
} }
...@@ -43,13 +50,18 @@ function printHelp() { ...@@ -43,13 +50,18 @@ function printHelp() {
' pnpm cold-import -- --dir=<dir> [--dry-run]', ' pnpm cold-import -- --dir=<dir> [--dry-run]',
'', '',
'Options:', 'Options:',
' --dir=<path> 必填,manifest.yaml 所在目录(相对 / 绝对路径)', ' --dir=<path> 必填,manifest.yaml 所在目录(相对 / 绝对路径)',
' --dry-run 只读 + 翻译预览,不写库', ' --dry-run 只读 + 翻译预览,不写库',
' --help, -h 显示本帮助', ' --incremental 增量模式(读上次 cursor,SQL 注 WHERE > cursor;首跑等价全量)',
' --cohort-batch=<N> 按 patient 分批跑(default 5000;env PAC_COHORT_BATCH_SIZE 兜底)',
' --no-cohort 显式禁用分批,跑 single-shot(文件源或调试场景)',
' --help, -h 显示本帮助',
'', '',
'Examples:', 'Examples:',
' pnpm cold-import -- --dir=./data/jvs-dw --dry-run', ' pnpm cold-import -- --dir=./data/jvs-dw --dry-run',
' pnpm cold-import -- --dir=./data/jvs-dw', ' pnpm cold-import -- --dir=./data/jvs-dw',
' pnpm cold-import -- --dir=./data/jvs-dw --cohort-batch=2000',
' pnpm cold-import -- --dir=./data/jvs-dw --incremental',
'', '',
]; ];
// eslint-disable-next-line no-console // eslint-disable-next-line no-console
...@@ -64,7 +76,10 @@ async function bootstrap() { ...@@ -64,7 +76,10 @@ async function bootstrap() {
} }
const logger = new Logger('cold-import:cli'); const logger = new Logger('cold-import:cli');
logger.log(`Starting cold-import CLI(dir=${args.dir}, dryRun=${args.dryRun})`); logger.log(
`Starting cold-import CLI(dir=${args.dir}, dryRun=${args.dryRun}, ` +
`incremental=${args.incremental}, cohortBatch=${args.cohortBatchSize ?? '(default/env)'})`,
);
const app = await NestFactory.createApplicationContext(AppModule, { const app = await NestFactory.createApplicationContext(AppModule, {
logger: ['log', 'warn', 'error'], logger: ['log', 'warn', 'error'],
...@@ -72,7 +87,11 @@ async function bootstrap() { ...@@ -72,7 +87,11 @@ async function bootstrap() {
try { try {
const svc = app.get(ColdImportService); const svc = app.get(ColdImportService);
const result = await svc.importDirectory(args.dir!, { dryRun: args.dryRun }); const result = await svc.importDirectory(args.dir!, {
dryRun: args.dryRun,
incremental: args.incremental,
cohortBatchSize: args.cohortBatchSize,
});
logger.log('─────────────────────────────────────────'); logger.log('─────────────────────────────────────────');
logger.log(`Result:`); logger.log(`Result:`);
......
...@@ -233,6 +233,151 @@ export class ClickHouseSourceService { ...@@ -233,6 +233,151 @@ export class ClickHouseSourceService {
return tables; return tables;
} }
/**
* 列出全部 patient (patient_id, brand) pair,作为 cohort batch 的 ORDER 边界。
*
* 用 fact_client_out 当 source of truth(它就是 patient 主档表);
* 增量模式 cursorAfter 用 last_visit_time 列(同 manifest.sql_source.incremental.per_query.fact_client_out)。
*
* ORDER BY patient_id ASC 保证 OFFSET LIMIT 分页稳定;
* 跨 batch 时同一 patient 不会拆开(LIMIT 5000 = 拿 5000 个 distinct patient)。
*/
async listPatientPairs(
source: ClickHouseSource,
incremental?: IncrementalConfig,
): Promise<Array<{ patient_id: string; brand: string }>> {
const conn = this.resolveConnection(source);
const client = createClient({
url: conn.url,
database: conn.database,
username: conn.username,
password: conn.password,
request_timeout: 60_000,
compression: { response: true, request: false },
});
try {
const cursorCfg = incremental?.perQuery['fact_client_out'];
const whereParts: string[] = [];
if (cursorCfg?.cursorValue) {
whereParts.push(
`${cursorCfg.cursorColumn} > '${cursorCfg.cursorValue.replace(/'/g, "''")}'`,
);
}
const whereSql = whereParts.length > 0 ? ` WHERE ${whereParts.join(' AND ')}` : '';
const sql = `SELECT DISTINCT patient_id, brand FROM dw_group.fact_client_out${whereSql} ORDER BY patient_id`;
this.logger.log(`[clickhouse·cohort] list patient pairs — ${sql.slice(0, 200)}`);
const started = Date.now();
const result = await client.query({ query: sql, format: 'JSONEachRow' });
const rows = (await result.json()) as Array<{ patient_id: string; brand: string }>;
this.logger.log(
`[clickhouse·cohort] → ${rows.length} distinct (patient_id, brand) pairs, ${Date.now() - started} ms`,
);
return rows;
} finally {
await client.close();
}
}
/**
* 加载指定 patient 集合的所有 source 表(cohort batch 模式入口)。
*
* 跟 loadAllTables 的差别:
* - 每张表 SQL 注入 `(patient_id, brand) IN ((p1,b1),(p2,b2),...)` 过滤
* - 增量 cursor 跟全量一样支持(WHERE cursor_col > val AND in (...) 二者并存)
* - 不做反向拉主档(本批 fact_client_out 已含本批所有 patient 主档,无需补)
* - default_limit 仍然应用(防御性,本批数据通常远小于 limit)
*
* 用途:importDirectory 在 cohort batch 模式下按 patient 段调用,
* 每批跑完释放内存,bound 资源使用。
*/
async loadTablesForCohort(
source: ClickHouseSource,
pairs: ReadonlyArray<{ patient_id: string; brand: string }>,
incremental?: IncrementalConfig,
): Promise<Record<string, unknown[]>> {
const conn = this.resolveConnection(source);
const client = createClient({
url: conn.url,
database: conn.database,
username: conn.username,
password: conn.password,
request_timeout: 60_000,
compression: { response: true, request: false },
});
// 构造 IN tuple 列表(同 reversePullPatientMaster 风格)
const tuples = pairs
.map(
(p) =>
`('${(p.patient_id ?? '').replace(/'/g, "''")}', '${(p.brand ?? '').replace(/'/g, "''")}')`,
)
.join(',');
const tables: Record<string, unknown[]> = {};
const defaultLimit = source.default_limit ?? 100_000;
try {
for (const [tableName, sql] of Object.entries(source.queries)) {
const incCfg = incremental?.perQuery[tableName];
// 1. cursor 注入(同 loadAllTables 路径)
const sqlWithCursor = incCfg
? this.injectIncrementalCursor(sql, incCfg.cursorColumn, incCfg.cursorValue)
: sql;
// 2. cohort 注入:在 WHERE 后追加 AND (patient_id, brand) IN (...)
const sqlWithCohort = this.injectCohortFilter(sqlWithCursor, tuples);
const sqlWithLimit = this.applyDefaultLimit(sqlWithCohort, defaultLimit);
this.logger.log(
`[clickhouse·cohort] query "${tableName}" — ${sqlWithLimit.slice(0, 140).replace(/\s+/g, ' ')}...`,
);
const started = Date.now();
const result = await client.query({ query: sqlWithLimit, format: 'JSONEachRow' });
const rows = (await result.json()) as unknown[];
tables[tableName] = rows;
this.logger.log(
`[clickhouse·cohort] "${tableName}" → ${rows.length} 行,${Date.now() - started} ms`,
);
// 增量 cursor 推进(同 loadAllTables;cohort 模式下也累积,run_start cursor 会覆盖)
if (incremental && incCfg && rows.length > 0) {
const maxVal = this.computeMax(rows as Record<string, unknown>[], incCfg.cursorColumn);
if (maxVal) {
incremental.cursorAdvances = incremental.cursorAdvances ?? {};
const prev = incremental.cursorAdvances[tableName];
if (!prev || maxVal > prev) {
incremental.cursorAdvances[tableName] = maxVal;
}
}
}
}
} finally {
await client.close();
}
return tables;
}
/// 把 (patient_id, brand) IN tuple 过滤注入到已有 SQL 的 WHERE 末尾(无 WHERE 时新建)
private injectCohortFilter(sql: string, tuplesCsv: string): string {
if (!tuplesCsv) return sql; // 空 cohort 不注入(理论上不该发生)
const cohortClause = `(patient_id, brand) IN (${tuplesCsv})`;
// 找最外层 WHERE(若有);若已有 ORDER BY/LIMIT/GROUP BY,要插在它们之前
const hasWhere = /\bWHERE\b/i.test(sql);
if (!hasWhere) {
// 在 ORDER/LIMIT/GROUP 之前 / FROM 之后插 WHERE
const tailMatch = sql.match(/(\s+(?:ORDER\s+BY|LIMIT|GROUP\s+BY)\b[\s\S]*)$/i);
if (tailMatch) {
const tail = tailMatch[1]!;
return sql.slice(0, sql.length - tail.length) + ` WHERE ${cohortClause}` + tail;
}
return `${sql} WHERE ${cohortClause}`;
}
// 已有 WHERE:在 ORDER/LIMIT/GROUP 之前 / WHERE body 末尾插 AND ...
const tailMatch = sql.match(/(\s+(?:ORDER\s+BY|LIMIT|GROUP\s+BY)\b[\s\S]*)$/i);
if (tailMatch) {
const tail = tailMatch[1]!;
return sql.slice(0, sql.length - tail.length) + ` AND ${cohortClause}` + tail;
}
return `${sql} AND ${cohortClause}`;
}
/// 把原 SQL 的 cohort/cursor/ORDER/LIMIT 全部剥离,改写为 patient_id+brand 精确过滤 /// 把原 SQL 的 cohort/cursor/ORDER/LIMIT 全部剥离,改写为 patient_id+brand 精确过滤
private injectPatientFilter( private injectPatientFilter(
originalSql: string, originalSql: string,
......
...@@ -58,7 +58,13 @@ export class ColdImportService { ...@@ -58,7 +58,13 @@ export class ColdImportService {
async importDirectory( async importDirectory(
dir: string, dir: string,
options: { dryRun?: boolean; incremental?: boolean } = {}, options: {
dryRun?: boolean;
incremental?: boolean;
/// 按 patient 分批跑(内存友好)。 null/undefined/0 → 单 shot(向后兼容);
/// 实际生效大小见 resolveCohortBatchSize(env PAC_COHORT_BATCH_SIZE 兜底,默认 5000)
cohortBatchSize?: number | null;
} = {},
): Promise<ImportRunResult> { ): Promise<ImportRunResult> {
const absDir = path.resolve(dir); const absDir = path.resolve(dir);
const runId = randomUUID(); const runId = randomUUID();
...@@ -170,80 +176,99 @@ export class ColdImportService { ...@@ -170,80 +176,99 @@ export class ColdImportService {
let status: string = SyncStatus.FAILED; let status: string = SyncStatus.FAILED;
try { try {
// 5. 一次性加载所有 raw tables(文件 / ClickHouse 二选一) // 5. 加载 assembler configs(一次,全 cohort batch 共用)
let tables = await this.loadAllTables(absDir, manifest, incrementalConfig);
// 5.5 Layer A.5 transforms — yaml 声明式形态改造(JSON 拆行 / 派生 / 路由等)
if (manifest.transforms && manifest.transforms.length > 0) {
this.logger.log(
`Layer A.5 transforms:${manifest.transforms.length} ,sources=${Object.keys(tables).length}`,
);
const transformInputs = tables as Record<string, Record<string, unknown>[]>;
tables = this.transformEngine.run({ tables: transformInputs, transforms: manifest.transforms });
this.logger.log(
`Layer A.5 done,tables 现有 ${Object.keys(tables).length} ` +
` ( transform 产出):${Object.entries(tables)
.map(([k, v]) => `${k}=${v.length}`)
.join(', ')}`,
);
}
// 6. 一次性读所有 assembler configs
const assemblerConfigs = this.loadAllAssemblers(absDir, manifest); const assemblerConfigs = this.loadAllAssemblers(absDir, manifest);
const patientCfg = assemblerConfigs.find((c) => c.canonical === 'patient'); const patientCfg = assemblerConfigs.find((c) => c.canonical === 'patient');
const subjectCfgs = assemblerConfigs.filter((c) => c.canonical !== 'patient'); const subjectCfgs = assemblerConfigs.filter((c) => c.canonical !== 'patient');
const normalize = { amountUnit: manifest.amount_unit, timezone: manifest.timezone }; const normalize = { amountUnit: manifest.amount_unit, timezone: manifest.timezone };
// 7. patients 先做(建立 patient_id 索引,供 subject 资源关联) // 6. 决定 cohort 模式:
if (patientCfg) { // - cohortBatchSize > 0 + ClickHouse 源 → 分批跑(内存友好,resume 友好)
const stats = await this.processPatients( // - 未设 / 文件源 → 单 shot(向后兼容,csv 模式数据量小不需分批)
tables, const cohortBatchSize = resolveCohortBatchSize(options.cohortBatchSize);
patientCfg, if (cohortBatchSize && manifest.sql_source) {
host.id, // ── cohort 模式 ──
tenantResolver, // 6a. 列出全部 patient pair(增量 cursor 同样适用)
seenTenants, const allPairs = await this.chSource.listPatientPairs(
normalize, manifest.sql_source,
options.dryRun ?? false, incrementalConfig,
); );
perResource.push(stats); if (allPairs.length === 0) {
totals.patientsUpserted += stats.patientsUpserted; this.logger.log(`Cohort: 0 patients in scope(增量空跑或源数据为空) skip batch`);
totals.failed += stats.failed; } else {
if (!firstError && stats.failed > 0) const batches = chunk(allPairs, cohortBatchSize);
firstError = `patient: ${stats.failed} rows failed`; const resumeFrom = await this.readCheckpointOffset(syncLog?.id);
} else { this.logger.log(
this.logger.warn( `Cohort 模式:patients=${allPairs.length} batchSize=${cohortBatchSize} batches=${batches.length}` +
`No assembler for canonical='patient';skipping patient master upsert`, (resumeFrom > 0 ? ` resumeFrom=${resumeFrom}` : ''),
);
}
// 8. 其他资源(transaction 合成 + parser 衍生 fact)
for (const cfg of subjectCfgs) {
try {
const stats = await this.processSubject(
tables,
cfg,
host.id,
tenantResolver,
seenTenants,
normalize,
syncLog?.id,
options.dryRun ?? false,
); );
perResource.push(stats); for (let i = resumeFrom; i < batches.length; i++) {
totals.transactionsWritten += stats.transactionsWritten; const batchStart = Date.now();
totals.duplicates += stats.duplicates; const batch = batches[i]!;
totals.failed += stats.failed; this.logger.log(
totals.factsCreated += stats.factsCreated; `─── cohort batch ${i + 1}/${batches.length} ` +
totals.factsSuperseded += stats.factsSuperseded; `(patients ${i * cohortBatchSize}..${i * cohortBatchSize + batch.length - 1}) ─────`,
totals.factsUnchanged += stats.factsUnchanged; );
totals.factsEvidenceAppended += stats.factsEvidenceAppended; await this.processCohort({
totals.factsFailed += stats.factsFailed; absDir,
} catch (err) { manifest,
const msg = err instanceof Error ? err.message : String(err); host,
firstError ||= `${cfg.canonical}: ${msg}`; tenantResolver,
this.logger.error(`Resource ${cfg.canonical} failed: ${msg}`); seenTenants,
normalize,
patientCfg,
subjectCfgs,
syncLogId: syncLog?.id,
dryRun: options.dryRun ?? false,
cohort: batch,
incrementalConfig,
perResource,
totals,
onFirstError: (msg) => { if (!firstError) firstError = msg; },
});
// 6b. checkpoint 写入(给 --resume + 进度可观测用)
if (syncLog) {
const elapsedMs = Date.now() - batchStart;
const memMb = Math.round(process.memoryUsage().rss / 1024 / 1024);
await this.prisma.syncLog.update({
where: { id: syncLog.id },
data: {
metadata: {
cohortTotal: batches.length,
cohortDone: i + 1,
batchSize: cohortBatchSize,
lastBatchMs: elapsedMs,
lastBatchRssMb: memMb,
lastBatchAt: new Date().toISOString(),
},
},
});
}
this.logger.log(
`─── cohort batch ${i + 1}/${batches.length} done ${Date.now() - batchStart}ms ` +
`rss=${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB ─────`,
);
}
} }
} else {
// ── single-shot 模式(向后兼容 / 文件源)──
await this.processCohort({
absDir,
manifest,
host,
tenantResolver,
seenTenants,
normalize,
patientCfg,
subjectCfgs,
syncLogId: syncLog?.id,
dryRun: options.dryRun ?? false,
cohort: null,
incrementalConfig,
perResource,
totals,
onFirstError: (msg) => { if (!firstError) firstError = msg; },
});
} }
status = status =
...@@ -482,6 +507,125 @@ export class ColdImportService { ...@@ -482,6 +507,125 @@ export class ColdImportService {
} }
// ───────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────
// processCohort —— 单次 load → transform → assemble → write 循环
// 单 shot 模式:cohort=null,load 全表
// cohort 模式: cohort=patient pairs,load 仅这些 patient 的数据
//
// 每次调用结束,tables / transform 中间产物全部出作用域 → V8 GC 释放(降内存峰值)
// ─────────────────────────────────────────────────────────
private async processCohort(args: {
absDir: string;
manifest: ColdImportManifest;
host: { id: string; name: string };
tenantResolver: TenantResolver;
seenTenants: Set<string>;
normalize: { amountUnit: 'fen' | 'yuan'; timezone: string };
patientCfg: AssemblerConfig | undefined;
subjectCfgs: AssemblerConfig[];
syncLogId: string | undefined;
dryRun: boolean;
cohort: ReadonlyArray<{ patient_id: string; brand: string }> | null;
incrementalConfig: IncrementalConfig | undefined;
/// 累加进调用方;由调用方维护(跨 cohort 全 run 汇总)
perResource: PerResourceStats[];
totals: TotalsBlock;
onFirstError: (msg: string) => void;
}): Promise<void> {
// 1. Load raw tables(cohort 过滤 / 全表)
let tables: Record<string, unknown[]>;
if (args.cohort && args.manifest.sql_source) {
tables = await this.chSource.loadTablesForCohort(
args.manifest.sql_source,
args.cohort,
args.incrementalConfig,
);
} else {
tables = await this.loadAllTables(args.absDir, args.manifest, args.incrementalConfig);
}
// 2. Layer A.5 transforms
if (args.manifest.transforms && args.manifest.transforms.length > 0) {
this.logger.log(
`Layer A.5 transforms:${args.manifest.transforms.length} 步,sources=${Object.keys(tables).length}`,
);
const transformInputs = tables as Record<string, Record<string, unknown>[]>;
tables = this.transformEngine.run({
tables: transformInputs,
transforms: args.manifest.transforms,
});
this.logger.log(
`Layer A.5 done,tables 现有 ${Object.keys(tables).length} 张` +
` (含 transform 产出):${Object.entries(tables)
.map(([k, v]) => `${k}=${v.length}`)
.join(', ')}`,
);
}
// 3. patients 先做(建立 patient_id 索引,供 subject 资源关联)
if (args.patientCfg) {
const stats = await this.processPatients(
tables,
args.patientCfg,
args.host.id,
args.tenantResolver,
args.seenTenants,
args.normalize,
args.dryRun,
);
args.perResource.push(stats);
args.totals.patientsUpserted += stats.patientsUpserted;
args.totals.failed += stats.failed;
if (stats.failed > 0) args.onFirstError(`patient: ${stats.failed} rows failed`);
} else {
this.logger.warn(
`No assembler for canonical='patient';skipping patient master upsert`,
);
}
// 4. 其他资源(transaction 合成 + parser 衍生 fact)
for (const cfg of args.subjectCfgs) {
try {
const stats = await this.processSubject(
tables,
cfg,
args.host.id,
args.tenantResolver,
args.seenTenants,
args.normalize,
args.syncLogId,
args.dryRun,
);
args.perResource.push(stats);
args.totals.transactionsWritten += stats.transactionsWritten;
args.totals.duplicates += stats.duplicates;
args.totals.failed += stats.failed;
args.totals.factsCreated += stats.factsCreated;
args.totals.factsSuperseded += stats.factsSuperseded;
args.totals.factsUnchanged += stats.factsUnchanged;
args.totals.factsEvidenceAppended += stats.factsEvidenceAppended;
args.totals.factsFailed += stats.factsFailed;
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
args.onFirstError(`${cfg.canonical}: ${msg}`);
this.logger.error(`Resource ${cfg.canonical} failed: ${msg}`);
}
}
}
/// 读已存在 sync_log 的 metadata.cohortDone(用于 --resume 续跑)
/// 当前实现:syncLog 是本次新建的 running 行,metadata 为空 → 返回 0
/// 真正 resume 需 PR3 加 --resume CLI 参数 + 复用已有 running sync_log id(stale 锁释放后)
private async readCheckpointOffset(syncLogId: string | undefined): Promise<number> {
if (!syncLogId) return 0;
const log = await this.prisma.syncLog.findUnique({
where: { id: syncLogId },
select: { metadata: true },
});
const meta = log?.metadata as { cohortDone?: number } | null;
return meta?.cohortDone ?? 0;
}
// ─────────────────────────────────────────────────────────
// Patients 主档(用同一份 yaml 形态,但走 upsert 路径) // Patients 主档(用同一份 yaml 形态,但走 upsert 路径)
// ───────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────
...@@ -926,6 +1070,36 @@ export interface PerResourceStats extends TotalsBlock { ...@@ -926,6 +1070,36 @@ export interface PerResourceStats extends TotalsBlock {
} }
/** /**
* cohort batch size 解析:
* option 显式 > 0 → 用 option
* option 显式 0/null → 强制 single-shot(向后兼容,文件源也走此路径)
* option 未指定 → fallback env PAC_COHORT_BATCH_SIZE(默认 5000)
*
* 返回 0 = 不分批(single-shot);>0 = cohort 大小
*/
export function resolveCohortBatchSize(opt: number | null | undefined): number {
if (opt !== undefined && opt !== null) {
return opt > 0 ? Math.floor(opt) : 0;
}
const env = process.env.PAC_COHORT_BATCH_SIZE;
if (env !== undefined) {
const n = parseInt(env, 10);
return Number.isFinite(n) && n > 0 ? n : 0;
}
return 5000;
}
/** 把数组切成 size 大小的块(尾巴块可能 < size) */
export function chunk<T>(arr: ReadonlyArray<T>, size: number): T[][] {
if (size <= 0) return [arr.slice() as T[]];
const out: T[][] = [];
for (let i = 0; i < arr.length; i += size) {
out.push(arr.slice(i, i + size) as T[]);
}
return out;
}
/**
* 给 externalId 算确定性"假手机号"(demo 兜底,host 真给的 phone 走真值)。 * 给 externalId 算确定性"假手机号"(demo 兜底,host 真给的 phone 走真值)。
* 算法:138 + (externalId 转 number) mod 1e8,补 0 至 8 位 → 138 + 8 位 = 11 位 * 算法:138 + (externalId 转 number) mod 1e8,补 0 至 8 位 → 138 + 8 位 = 11 位
* 同 externalId 多次 import 出同号(便于 demo / debug)。 * 同 externalId 多次 import 出同号(便于 demo / debug)。
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment