Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion clis/jianyu/search.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ describe('jianyu search helpers', () => {
const filtered = __test__.filterNavigationRows('电梯', [
{ title: '招标公告', url: 'https://www.jianyu360.cn/list/stype/ZBGG.html', date: '' },
{ title: '帮助中心', url: 'https://www.jianyu360.cn/helpCenter/index', date: '' },
{ title: '某项目电梯采购公告', url: 'https://www.jianyu360.cn/notice/detail/123', date: '2026-04-07' },
{ title: '某项目电梯采购公告', url: 'https://shandong.jianyu360.cn/jybx/20260407_123.html', date: '2026-04-07' },
]);
expect(filtered).toHaveLength(1);
expect(filtered[0].title).toContain('电梯采购公告');
Expand Down Expand Up @@ -137,4 +137,23 @@ describe('jianyu search helpers', () => {
expect(result.rows[0].title).toContain('电梯采购公告');
expect(result.rows[1].title).toContain('另一条电梯采购公告');
});

// A `/nologin/content/` link must be reported with detail_status `blocked`.
it('classifies nologin links as blocked detail targets', () => {
  const nologinUrl = 'https://www.jianyu360.cn/nologin/content/ABC.html';
  const { detail_status: status } = __test__.classifyDetailStatus(nologinUrl);
  expect(status).toBe('blocked');
});

// For a `/jybx/<id>.html` URL the notice id is the file name without `.html`.
it('extracts stable notice id from jybx urls', () => {
  const jybxUrl = 'https://shandong.jianyu360.cn/jybx/20260310_26030938267551.html';
  const extracted = __test__.extractNoticeId(jybxUrl);
  expect(extracted).toBe('20260310_26030938267551');
});

// With a 30-day window anchored at 2026-04-09 (UTC): a date 20 days back is
// kept, a date more than two months back is dropped, and an empty date is dropped.
it('keeps only rows inside recency window', () => {
  const windowDays = 30;
  const referenceInstant = '2026-04-09T00:00:00Z';
  // Every probe gets its own Date instance, evaluated before any expectation runs.
  const probe = (dateText: string): boolean =>
    __test__.isWithinSinceDays(dateText, windowDays, new Date(referenceInstant));

  const [within, stale, missing] = ['2026-03-20', '2026-02-01', ''].map((dateText) => probe(dateText));

  expect(within).toBe(true);
  expect(stale).toBe(false);
  expect(missing).toBe(false);
});
});
148 changes: 145 additions & 3 deletions clis/jianyu/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,14 @@ const NAVIGATION_PATH_PREFIXES = [
'/exhibition/',
'/swordfish/page_big_pc/search/',
];
/**
 * Path fragments of detail pages that cannot be read directly.
 * `classifyDetailStatus` reports a URL whose path contains one of these as
 * `blocked` with reason `verification_or_paid_wall`.
 */
const BLOCKED_DETAIL_PATH_PREFIXES = ['/nologin/content/', '/article/bdprivate/'];

/**
 * Literal tuple of Jianyu API type keys.
 * NOTE(review): the consumers are outside this view — presumably each key
 * selects one API request variant; confirm against the fetch code.
 */
const JIANYU_API_TYPES = [
  'fType',
  'eType',
  'vType',
  'mType',
] as const;

/**
 * Verdict attached to a result URL by `classifyDetailStatus`:
 * `ok` for `/jybx/` detail pages, `blocked` for missing, invalid or walled
 * URLs, and `entry_only` for everything else.
 */
type DetailStatus =
  | 'ok'
  | 'blocked'
  | 'entry_only';

interface JianyuApiPayload {
antiVerify?: number;
error_code?: number;
Expand Down Expand Up @@ -103,6 +109,94 @@ function isLikelyNavigationUrl(rawUrl: string): boolean {
}
}

/**
 * Decide whether a search-result URL points at a usable detail page.
 *
 * Verdicts, evaluated in this order:
 * - empty after `cleanText`                                  → `blocked` / `missing_url`
 * - lower-cased path contains `/jybx/`                       → `ok` / `jybx_detail`
 * - path contains an entry of `BLOCKED_DETAIL_PATH_PREFIXES` → `blocked` / `verification_or_paid_wall`
 * - `isLikelyNavigationUrl` accepts the URL                  → `entry_only` / `navigation_or_profile_entry`
 * - anything else                                            → `entry_only` / `non_jybx_entry`
 * - any exception while parsing or classifying               → `blocked` / `invalid_url`
 *
 * @param rawUrl URL text; it is passed through `cleanText` before parsing.
 * @returns The status together with a machine-readable reason string.
 */
function classifyDetailStatus(rawUrl: string): { detail_status: DetailStatus; detail_reason: string } {
  const verdict = (detail_status: DetailStatus, detail_reason: string) => ({ detail_status, detail_reason });

  const candidate = cleanText(rawUrl);
  if (!candidate) return verdict('blocked', 'missing_url');

  try {
    const { pathname } = new URL(candidate);
    // Lower-case the path and collapse a run of trailing slashes into a single one.
    const normalizedPath = cleanText(pathname).toLowerCase().replace(/\/+$/, '/') || '/';

    // The /jybx/ check runs first, so it wins over the blocked-path list.
    if (normalizedPath.includes('/jybx/')) return verdict('ok', 'jybx_detail');

    const hitsWall = BLOCKED_DETAIL_PATH_PREFIXES.some((prefix) => normalizedPath.includes(prefix));
    if (hitsWall) return verdict('blocked', 'verification_or_paid_wall');

    const entryReason = isLikelyNavigationUrl(candidate)
      ? 'navigation_or_profile_entry'
      : 'non_jybx_entry';
    return verdict('entry_only', entryReason);
  } catch {
    // Unparseable URL text, or any failure in the checks above.
    return verdict('blocked', 'invalid_url');
  }
}

/**
 * Derive a notice identifier from a detail URL.
 *
 * For a path ending in `/jybx/<id>.html` the `<id>` part is returned. For any
 * other path the last non-empty segment is returned with a trailing `.htm` or
 * `.html` removed.
 *
 * @param rawUrl URL text; it is passed through `cleanText` before parsing.
 * @returns The identifier, or `''` when the input is empty, cannot be parsed
 *          as a URL, or has no path segments.
 */
function extractNoticeId(rawUrl: string): string {
  const candidate = cleanText(rawUrl);
  if (!candidate) return '';

  try {
    const pathname = cleanText(new URL(candidate).pathname);

    // Preferred form: /jybx/<id>.html at the very end of the path.
    const jybxId = pathname.match(/\/jybx\/([^/?#]+)\.html$/i)?.[1];
    if (jybxId) return cleanText(jybxId);

    // Fallback: last non-empty path segment without its .htm/.html suffix.
    const parts = pathname.split('/').filter(Boolean);
    const lastPart = cleanText(parts[parts.length - 1] || '');
    const withoutSuffix = lastPart.replace(/\.html?$/i, '');
    return cleanText(withoutSuffix);
  } catch {
    return '';
  }
}

/**
 * Check whether a publish date lies inside the last `sinceDays` days.
 *
 * The date text goes through `normalizeDate`, and the result is parsed as UTC
 * midnight by appending `T00:00:00Z`. It is compared against the UTC calendar
 * day of `now`. Both ends are inclusive: an age of `0` and an age of exactly
 * `sinceDays` pass. Dates after `now`'s day are rejected.
 *
 * @param dateText  Raw date text.
 * @param sinceDays Maximum accepted age in whole days.
 * @param now       Reference instant; defaults to the current time.
 * @returns `false` when the date is empty after normalization, unparseable,
 *          in the future, or older than the window; `true` otherwise.
 */
function isWithinSinceDays(
  dateText: string,
  sinceDays: number,
  now: Date = new Date(),
): boolean {
  const MS_PER_DAY = 24 * 3600 * 1000;

  const isoDay = normalizeDate(dateText);
  if (!isoDay) return false;

  const publishedMs = Date.parse(`${isoDay}T00:00:00Z`);
  if (!Number.isFinite(publishedMs)) return false;

  // Truncate `now` to the start of its UTC day so only calendar days are compared.
  const todayMs = Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate());
  const ageInDays = Math.floor((todayMs - publishedMs) / MS_PER_DAY);

  // Kept as two positive comparisons so a NaN operand still yields false.
  return 0 <= ageInDays && ageInDays <= sinceDays;
}

/**
 * Remove duplicate rows, keeping the first occurrence and the input order.
 *
 * Two rows are duplicates when they share an identity key. The key is
 * `source_id` + tab + `notice_id` when both are non-empty after `cleanText`;
 * otherwise it is the cleaned `title` + tab + cleaned `url`.
 *
 * @param items Rows to deduplicate; the array itself is not modified.
 * @returns A new array holding the first row seen for each key.
 */
function dedupeByNoticeKey<T extends { source_id?: string; notice_id?: string; title: string; url: string }>(items: T[]): T[] {
  // A Map iterates in insertion order, so its values come back in first-seen order.
  const firstByKey = new Map<string, T>();

  for (const entry of items) {
    const sourceId = cleanText(entry.source_id || '');
    const noticeId = cleanText(entry.notice_id || '');

    let identity: string;
    if (sourceId && noticeId) {
      identity = `${sourceId}\t${noticeId}`;
    } else {
      identity = `${cleanText(entry.title)}\t${cleanText(entry.url)}`;
    }

    if (identity && !firstByKey.has(identity)) {
      firstByKey.set(identity, entry);
    }
  }

  return Array.from(firstByKey.values());
}

function filterNavigationRows(query: string, items: Array<{
title?: string;
url?: string;
Expand All @@ -124,6 +218,8 @@ function filterNavigationRows(query: string, items: Array<{
}))
.filter((item) => {
if (!item.title || !item.url) return false;
const detailSignal = classifyDetailStatus(item.url);
if (detailSignal.detail_status !== 'ok') return false;
const haystack = `${item.title} ${item.contextText}`.toLowerCase();
const hasQuery = queryTokens.length === 0 || queryTokens.some((token) => haystack.includes(token));
const hasProcurementHint = PROCUREMENT_TITLE_HINT.test(`${item.title} ${item.contextText}`);
Expand Down Expand Up @@ -491,11 +587,13 @@ cli({
args: [
{ name: 'query', required: true, positional: true, help: 'Search keyword, e.g. "procurement"' },
{ name: 'limit', type: 'int', default: 20, help: 'Number of results (max 50)' },
{ name: 'since_days', type: 'int', default: 30, help: 'Only keep rows published within N days' },
],
columns: ['rank', 'content_type', 'title', 'publish_time', 'project_code', 'budget_or_limit', 'url'],
columns: ['rank', 'content_type', 'title', 'published_at', 'detail_status', 'project_code', 'budget_or_limit', 'url'],
func: async (page, kwargs) => {
const query = cleanText(kwargs.query);
const limit = Math.max(1, Math.min(Number(kwargs.limit) || 20, 50));
const sinceDays = Math.max(1, Math.min(Number(kwargs.since_days) || 30, 3650));
const apiResult = await fetchJianyuApiRows(page, query, limit);
const mergedRows = dedupeCandidates(filterNavigationRows(query, apiResult.rows));

Expand All @@ -512,11 +610,31 @@ cli({
const indexedRows = await fetchDuckDuckGoIndexRows(query, limit);
const filteredIndexedRows = dedupeCandidates(filterNavigationRows(query, indexedRows));
if (filteredIndexedRows.length > 0) {
return toProcurementSearchRecords(filteredIndexedRows, {
const records = toProcurementSearchRecords(filteredIndexedRows, {
site: SITE,
query,
limit,
});
const enriched = dedupeByNoticeKey(records.map((row) => {
const detailSignal = classifyDetailStatus(row.url);
const publishedAt = normalizeDate(row.publish_time || row.date);
return {
...row,
source_id: SITE,
notice_id: extractNoticeId(row.url),
published_at: publishedAt,
detail_status: detailSignal.detail_status,
detail_reason: detailSignal.detail_reason,
};
}))
.filter((row) => row.detail_status === 'ok')
.filter((row) => isWithinSinceDays(row.published_at, sinceDays))
.slice(0, limit)
.map((row, index) => ({
...row,
rank: index + 1,
}));
return enriched;
}

if (apiResult.challenge || await isAuthRequired(page)) {
Expand All @@ -527,11 +645,31 @@ cli({
}
}

return toProcurementSearchRecords(rows, {
const records = toProcurementSearchRecords(rows, {
site: SITE,
query,
limit,
});
const enriched = dedupeByNoticeKey(records.map((row) => {
const detailSignal = classifyDetailStatus(row.url);
const publishedAt = normalizeDate(row.publish_time || row.date);
return {
...row,
source_id: SITE,
notice_id: extractNoticeId(row.url),
published_at: publishedAt,
detail_status: detailSignal.detail_status,
detail_reason: detailSignal.detail_reason,
};
}))
.filter((row) => row.detail_status === 'ok')
.filter((row) => isWithinSinceDays(row.published_at, sinceDays))
.slice(0, limit)
.map((row, index) => ({
...row,
rank: index + 1,
}));
return enriched;
},
});

Expand All @@ -547,4 +685,8 @@ export const __test__ = {
normalizeApiRow,
fetchJianyuApiRows,
collectApiRowsFromResponses,
classifyDetailStatus,
extractNoticeId,
isWithinSinceDays,
dedupeByNoticeKey,
};
14 changes: 14 additions & 0 deletions clis/jianyu/shared/procurement-detail.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,18 @@ describe('procurement detail runner', () => {
})).rejects.toThrow('[taxonomy=extraction_drift]');
expect(attempts).toBe(1);
});

// A detail row whose title/body is a captcha prompt must make the runner
// reject with the selector_drift taxonomy marker.
it('rejects captcha/verification pages as selector_drift', async () => {
  // Build a fresh row object on every evaluation.
  const captchaRow = () => ({
    title: '验证码',
    detailText: '请在下图依次点击:槨畽黛',
    publishTime: '',
  });
  const page = createPage(async () => captchaRow());

  const request = {
    url: 'https://www.jianyu360.cn/nologin/content/ABC.html',
    site: 'jianyu',
    query: '电梯',
  };

  await expect(runProcurementDetail(page as never, request)).rejects.toThrow('[taxonomy=selector_drift]');
});
});
15 changes: 15 additions & 0 deletions clis/jianyu/shared/procurement-detail.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ const RETRYABLE_DETAIL_ERROR_PATTERNS = [
/cannot find context with specified id/i,
/\[taxonomy=empty_result\]/i,
];
/**
 * Phrases that mark a detail page as a verification / login gate rather than
 * real notice content. `runProcurementDetail` tests each pattern against the
 * cleaned `title + detailText` of the extracted row and throws a
 * `selector_drift` taxonomy error on the first hit.
 *
 * NOTE(review): the `i` flag has no effect here because the patterns contain
 * no cased letters. The short patterns match anywhere in the text, so a
 * genuine notice that merely mentions one of these phrases would presumably
 * also be rejected — confirm this is acceptable.
 */
const DETAIL_AUTH_CHALLENGE_PATTERNS = [
// "Please click the following in order in the image below" (click-captcha prompt)
/请在下图依次点击/i,
// "Verification code" / captcha
/验证码/i,
// "Please complete verification"
/请完成验证/i,
// "Verify login"
/验证登录/i,
// "Log in to get more browsing permissions"
/登录即可获得更多浏览权限/i,
];

function isRetryableDetailError(error: unknown): boolean {
const message = error instanceof Error
Expand Down Expand Up @@ -83,6 +90,14 @@ export async function runProcurementDetail(
const title = cleanText(row.title);
const detailText = cleanText(row.detailText);
const publishTime = cleanText(row.publishTime);
const authGateText = cleanText(`${title} ${detailText}`);
if (DETAIL_AUTH_CHALLENGE_PATTERNS.some((pattern) => pattern.test(authGateText))) {
throw taxonomyError('selector_drift', {
site,
command: 'detail',
detail: `detail page blocked by verification challenge: ${targetUrl}`,
});
}
if (!title && !detailText) {
throw taxonomyError('empty_result', {
site,
Expand Down
Loading