From 2176bfd2e73b6d7866f8e7ad37fb89e7639dc922 Mon Sep 17 00:00:00 2001 From: myfreeer Date: Sat, 16 Jul 2022 15:30:36 +0800 Subject: [PATCH] node-js: refactor link process and decrypt contents Recently the source website added some login walls, and some content has been encrypted since then. This code decrypts and recovers the correct content without needing to log in. --- package-lock.json | 12 +- package.json | 2 + src/nodejs-cn/decrypt-contents.ts | 100 +++++++++++++++ src/nodejs-cn/fix-link.ts | 151 ++++++++++++++++++++++ src/nodejs-cn/life-cycle.ts | 207 +++++++----------------------- 5 files changed, 307 insertions(+), 165 deletions(-) create mode 100644 src/nodejs-cn/decrypt-contents.ts create mode 100644 src/nodejs-cn/fix-link.ts diff --git a/package-lock.json b/package-lock.json index 705fe8df..75467012 100644 --- a/package-lock.json +++ b/package-lock.json @@ -105,6 +105,11 @@ "@types/responselike": "*" } }, + "@types/crypto-js": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/@types/crypto-js/-/crypto-js-4.1.1.tgz", + "integrity": "sha512-BG7fQKZ689HIoc5h+6D2Dgq1fABRa0RbBWKBd9SP/MVRVXROflpm5fhwyATX5duFmbStzyzyycPB8qUYKDH3NA==" + }, "@types/http-cache-semantics": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.1.tgz", @@ -537,6 +542,11 @@ "which": "^2.0.1" } }, + "crypto-js": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-4.1.1.tgz", + "integrity": "sha512-o2JlM7ydqd3Qk9CA0L4NL6mTzU2sdx96a+oOfPu8Mkl/PK51vSyoi8/rQ8NknZtk44vq15lmhAj9CIAGwgeWKw==" + }, "css-select": { "version": "4.2.1", "resolved": "https://registry.npmjs.org/css-select/-/css-select-4.2.1.tgz", @@ -1435,7 +1445,7 @@ "pump": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", - "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "integrity": "sha1-tKIRaBW94vTh6mAjVOjHVWUQemQ=", 
"requires": { "end-of-stream": "^1.1.0", "once": "^1.3.1" diff --git a/package.json b/package.json index 9650edbc..0d7c00fc 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,9 @@ "node": ">=12.16.0" }, "dependencies": { + "@types/crypto-js": "^4.1.1", "agentkeepalive": "^4.2.1", + "crypto-js": "^4.1.1", "mkdirp": "^1.0.4", "tough-cookie": "^4.0.0", "urijs": "^1.19.11", diff --git a/src/nodejs-cn/decrypt-contents.ts b/src/nodejs-cn/decrypt-contents.ts new file mode 100644 index 00000000..869d41cf --- /dev/null +++ b/src/nodejs-cn/decrypt-contents.ts @@ -0,0 +1,100 @@ +import AES from 'crypto-js/aes'; +import Utf8 from 'crypto-js/enc-utf8'; +import got from 'got'; +import {error as errorLogger} from 'website-scrap-engine/lib/logger/logger'; +import type {StaticDownloadOptions} from 'website-scrap-engine/lib/options'; +import type {CheerioStatic} from 'website-scrap-engine/lib/types'; + +// the original decrypt and restore function +// function fn({docs, data, decrypt, enc, loadingDoms}) { +// docs.push(...(data.split('-a0a-').map(doc => { +// return decrypt(doc, 'qQ1').toString(enc.Utf8); +// }))); +// let index = 0; +// for (let loadingDom of loadingDoms) { +// loadingDom.innerHTML = docs[index]; +// index++; +// } +// } + +const separator = '-a0a-'; +// the real key is computed inside crypto-js using some kdf +const key = 'qQ1'; + +const decryptOne = (input: string): string => + AES.decrypt(input, key).toString(Utf8); + +const decrypt = (input: string): string[] => + input.split(separator).map(decryptOne); + +const regexp = /\.html$/; + +// note this is base64 encoded text, not real ttf font +const replaceAs = '.ttf'; + +const fetchAndDecrypt = async ( + url: string, + options: StaticDownloadOptions +): Promise => { + const realContentUrl = url.replace(regexp, replaceAs); + if (realContentUrl === url) { + errorLogger.error('fetchAndDecrypt: bad url', url); + return; + } + + const theGot = options?.req ? 
got.extend(options.req) : got; + + const resp = await theGot(realContentUrl); + + const body = resp.body; + if (!body?.length) { + errorLogger.error('fetchAndDecrypt: empty body', realContentUrl, resp); + return; + } + return decrypt(body); +}; + +const asyncCache: Record> = {}; + + +const cachedFetchAndDecrypt = async ( + url: string, + options: StaticDownloadOptions +): Promise => { + if (asyncCache[url] !== undefined) { + return asyncCache[url]; + } + return asyncCache[url] = fetchAndDecrypt(url, options); +}; + +export async function decryptContent( + $: CheerioStatic, url: string, options: StaticDownloadOptions +): Promise { + // the original selector from code and the original var name + const loadingDoms = $('#apicontent .loading'); + if (!loadingDoms.length) { + return; + } + // the original var name + let docs: string[] | void; + try { + docs = await cachedFetchAndDecrypt(url, options); + } catch (e) { + errorLogger.error('cachedFetchAndDecrypt', url, e); + return; + } + if (!docs) { + errorLogger.warn('no content found', url); + return; + } + if (loadingDoms.length !== docs.length) { + errorLogger.warn('length mismatch', + url, loadingDoms.length, docs.length); + } + for (let i = 0; i < loadingDoms.length; i++) { + const decrypted = docs[i]; + if (decrypted) { + loadingDoms.eq(i).removeClass('loading').html(decrypted); + } + } +} diff --git a/src/nodejs-cn/fix-link.ts b/src/nodejs-cn/fix-link.ts new file mode 100644 index 00000000..08eaf241 --- /dev/null +++ b/src/nodejs-cn/fix-link.ts @@ -0,0 +1,151 @@ +import got from 'got'; +import type {StaticDownloadOptions} from 'website-scrap-engine/lib/options'; +import {error as errorLogger} from 'website-scrap-engine/lib/logger/logger'; + +const KW_ARR_BEGIN = 'var arr = [', + KW_ARR_END = '];', + KW_ARR_INDEX_BEGIN = 'location.replace(arr['; + +const HOST = 'nodejs.cn', + PROTOCOL = 'http', + URL_PREFIX = `${PROTOCOL}://${HOST}`; + +const LOCATION_REPLACE_LITERAL = 'location.replace(\'', + 
LOCATION_REPLACE_LITERAL_END = '\')'; + + +const gotNoRedirect = got.extend({ + followRedirect: false +}); + +export const cache: Record = {}; +const asyncRedirectCache: Record> = {}; + +const getRedirectLocation = async ( + link: string, + options: StaticDownloadOptions +): Promise => { + // make sure that followRedirect is false here + const theGot = options?.req ? got.extend(options.req, { + followRedirect: false + }) : gotNoRedirect; + const redirect = await theGot( + link.startsWith('/s') ? URL_PREFIX + link : link); + if (redirect.statusCode === 302 && redirect.headers?.location) { + cache[link] = redirect.headers.location; + link = redirect.headers.location; + } else if (redirect.body) { + /** + * @type string + */ + const html = redirect.body; + const arrBegin = html.indexOf(KW_ARR_BEGIN), + arrEnd = html.indexOf(KW_ARR_END, arrBegin), + arrIndex = html.indexOf(KW_ARR_INDEX_BEGIN, arrEnd); + if (arrBegin > 0 && arrEnd > 0 && arrIndex > 0) { + try { + const arr = JSON.parse(html.slice( + arrBegin + KW_ARR_BEGIN.length - 1, arrEnd + 1)); + const i = parseInt(html.slice( + arrIndex + KW_ARR_INDEX_BEGIN.length), 10); + if (arr && !isNaN(i) && arr[i]) { + cache[link] = arr[i]; + link = arr[i]; + } else { + errorLogger.warn('Can not parse redirect for', link, arr, i); + } + } catch (e) { + errorLogger.error('Error resolving redirect result', link, html, e); + } + } else { + // the new redirect page since 2021 + const literalBegin = html.indexOf(LOCATION_REPLACE_LITERAL), + literalEnd = literalBegin > 0 ? 
+ html.indexOf(LOCATION_REPLACE_LITERAL_END, literalBegin) : -1; + if (literalBegin > 0 && literalEnd > 0) { + link = html.slice( + literalBegin + LOCATION_REPLACE_LITERAL.length, literalEnd); + } else { + errorLogger.warn('Unknown redirect result format', link, html); + } + } + } + // replace the api to required version + if (options?.meta?.nodeApiPath) { + link = link.replace(`${URL_PREFIX}/api/`, + `${URL_PREFIX}/${options.meta.nodeApiPath}/`); + } + return link; +}; + +export const cachedGetRedirectLocation = ( + link: string, options: StaticDownloadOptions +): string | Promise => { + if (cache[link]) { + return cache[link]; + } + if (asyncRedirectCache[link] !== undefined) { + return asyncRedirectCache[link]; + } + return asyncRedirectCache[link] = getRedirectLocation(link, options); +}; + +// the 404-not-found links +const hardCodedRedirectBuilder = (api: string): Record => ({ + [`/${api}/stream.md`]: `/${api}/stream.html`, + [`/${api}/http/net.html`]: `/${api}/net.html`, + [`/${api}/fs/stream.html`]: `/${api}/stream.html`, + [`/${api}/addons/n-api.html`]: `/${api}/n-api.html`, + [`/${api}/assert/tty.html`]: `/${api}/tty.html`, + [`/${api}/worker_threads/errors.html`]: `/${api}/errors.html`, + [`/${api}/process/cli.html`]: `/${api}/cli.html`, + [`/${api}/zlib/buffer.html`]: `/${api}/buffer.html`, + [`/${api}/dgram/errors.html`]: `/${api}/errors.html`, + [`/${api}/net/stream.html`]: `/${api}/stream.html`, + [`/${api}/process/stream.html`]: `/${api}/stream.html`, + [`/${api}/worker_threads/fs.html`]: `/${api}/fs.html`, + // 14.12.0 + [`/${api}/synopsis/cli.html`]: `/${api}/cli.html`, + // since 16.3.0 + [`/${api}/modules/esm.md`]: `/${api}/esm.html`, + // since 18.4.0 maybe for api-v14 and api-v16 + [`/${api}/esm.md`]: `/${api}/esm.html`, + +}); + +const hardCodedRedirectFullPathBuilder = (api: string): Record => ({ + // 14.9.0 + // http://nodejs.cn/api/module.html + [`http://${HOST}/${api}/modules_cjs.html#modules_cjs_the_module_wrapper`]: + 
`http://${HOST}/${api}/modules.html#modules_the_module_wrapper`, + // 14.9.0 + // http://nodejs.cn/api/module.html + [`http://${HOST}/${api}/modules_module.html#modules_module_class_module_sourcemap`]: + `http://${HOST}/${api}/module.html#module_class_module_sourcemap`, + // 14.9.0 + // http://nodejs.cn/api/module.html + [`http://${HOST}/${api}/modules/modules_module.html#modules_module_the_module_object`]: + `http://${HOST}/${api}/module.html#module_the_module_object`, + [`http://${HOST}/${api}/wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options`]: + 'https://wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options', + // 16.4.0 + [`/${api}/http_new_agent_options`]: + `http://${HOST}/${api}/http.html#http_new_agent_options` +}); + +export const hardCodedRedirect: Record = { + ...hardCodedRedirectBuilder('api'), + ...hardCodedRedirectBuilder('api-v16'), + ...hardCodedRedirectBuilder('api-v14'), +}; + +export const hardCodedRedirectFullPath: Record = { + ...hardCodedRedirectFullPathBuilder('api'), + ...hardCodedRedirectFullPathBuilder('api-v16'), + ...hardCodedRedirectFullPathBuilder('api-v14'), +}; + +export const initNodeApiPath = (api: string): void => { + Object.assign(hardCodedRedirect, hardCodedRedirectBuilder(api)); + Object.assign(hardCodedRedirectFullPath, hardCodedRedirectFullPathBuilder(api)); +}; diff --git a/src/nodejs-cn/life-cycle.ts b/src/nodejs-cn/life-cycle.ts index 921f8856..6df4007e 100644 --- a/src/nodejs-cn/life-cycle.ts +++ b/src/nodejs-cn/life-cycle.ts @@ -1,15 +1,12 @@ -import { join } from 'path'; +import {join} from 'path'; import URI from 'urijs'; -import got from 'got'; import type {Resource} from 'website-scrap-engine/lib/resource'; import {ResourceType} from 'website-scrap-engine/lib/resource'; import {error as errorLogger} from 'website-scrap-engine/lib/logger/logger'; -import type { - ProcessingLifeCycle, - ProcessResourceAfterDownloadFunc -} from 'website-scrap-engine/lib/life-cycle/types'; import type { 
DownloadResource, + ProcessingLifeCycle, + ProcessResourceAfterDownloadFunc, ProcessResourceBeforeDownloadFunc, SubmitResourceFunc } from 'website-scrap-engine/lib/life-cycle/types'; @@ -26,163 +23,23 @@ import { StaticDownloadOptions } from 'website-scrap-engine/lib/options'; import type {Cheerio, CheerioStatic} from 'website-scrap-engine/lib/types'; - -const gotNoRedirect = got.extend({ - followRedirect: false -}); - -const cache: Record = {}; -const asyncRedirectCache: Record> = {}; - -const KW_ARR_BEGIN = 'var arr = [', - KW_ARR_END = '];', - KW_ARR_INDEX_BEGIN = 'location.replace(arr['; +import type { + PipelineExecutor +} from 'website-scrap-engine/lib/life-cycle/pipeline-executor'; +import type {DownloaderWithMeta} from 'website-scrap-engine/lib/downloader/types'; +import {decryptContent} from './decrypt-contents'; +import { + cache, + cachedGetRedirectLocation, + hardCodedRedirect, + hardCodedRedirectFullPath, + initNodeApiPath +} from './fix-link'; const HOST = 'nodejs.cn', PROTOCOL = 'http', URL_PREFIX = `${PROTOCOL}://${HOST}`; -const LOCATION_REPLACE_LITERAL = 'location.replace(\'', - LOCATION_REPLACE_LITERAL_END = '\')'; - - -const getRedirectLocation = async ( - link: string, - options: StaticDownloadOptions -): Promise => { - // make sure that followRedirect is false here - const theGot = options?.req ? got.extend(options.req, { - followRedirect: false - }) : gotNoRedirect; - const redirect = await theGot( - link.startsWith('/s') ? 
URL_PREFIX + link : link); - if (redirect.statusCode === 302 && redirect.headers?.location) { - cache[link] = redirect.headers.location; - link = redirect.headers.location; - } else if (redirect.body) { - /** - * @type string - */ - const html = redirect.body; - const arrBegin = html.indexOf(KW_ARR_BEGIN), - arrEnd = html.indexOf(KW_ARR_END, arrBegin), - arrIndex = html.indexOf(KW_ARR_INDEX_BEGIN, arrEnd); - if (arrBegin > 0 && arrEnd > 0 && arrIndex > 0) { - try { - const arr = JSON.parse(html.slice( - arrBegin + KW_ARR_BEGIN.length - 1, arrEnd + 1)); - const i = parseInt(html.slice( - arrIndex + KW_ARR_INDEX_BEGIN.length), 10); - if (arr && !isNaN(i) && arr[i]) { - cache[link] = arr[i]; - link = arr[i]; - } else { - errorLogger.warn('Can not parse redirect for', link, arr, i); - } - } catch (e) { - errorLogger.error('Error resolving redirect result', link, html, e); - } - } else { - // the new redirect page since 2021 - const literalBegin = html.indexOf(LOCATION_REPLACE_LITERAL), - literalEnd = literalBegin > 0 ? 
- html.indexOf(LOCATION_REPLACE_LITERAL_END, literalBegin) : -1; - if (literalBegin > 0 && literalEnd > 0) { - link = html.slice( - literalBegin + LOCATION_REPLACE_LITERAL.length, literalEnd); - } else { - errorLogger.warn('Unknown redirect result format', link, html); - } - } - } - // replace the api to required version - if (options?.meta?.nodeApiPath) { - link = link.replace(`${URL_PREFIX}/api/`, - `${URL_PREFIX}/${options.meta.nodeApiPath}/`); - } - return link; -}; - -const cachedGetRedirectLocation = ( - link: string, options: StaticDownloadOptions -): string | Promise => { - if (cache[link]) { - return cache[link]; - } - if (asyncRedirectCache[link] !== undefined) { - return asyncRedirectCache[link]; - } - return asyncRedirectCache[link] = getRedirectLocation(link, options); -}; - -// the 404-not-found links -const hardCodedRedirect: Record = { - '/api/stream.md': '/api/stream.html', - '/api/http/net.html': '/api/net.html', - '/api/fs/stream.html': '/api/stream.html', - '/api/addons/n-api.html': '/api/n-api.html', - '/api/assert/tty.html': '/api/tty.html', - '/api/worker_threads/errors.html': '/api/errors.html', - '/api/process/cli.html': '/api/cli.html', - '/api/zlib/buffer.html': '/api/buffer.html', - '/api/dgram/errors.html': '/api/errors.html', - '/api/net/stream.html': '/api/stream.html', - '/api/process/stream.html': '/api/stream.html', - '/api/worker_threads/fs.html': '/api/fs.html', - // 14.12.0 - '/api/synopsis/cli.html': '/api/cli.html', - // since 16.3.0 - '/api/modules/esm.md': '/api/esm.html' -}; - -const hardCodedRedirectFullPath: Record = { - // 14.9.0 - // http://nodejs.cn/api/module.html - 'http://nodejs.cn/api/modules_cjs.html#modules_cjs_the_module_wrapper': - 'http://nodejs.cn/api/modules.html#modules_the_module_wrapper', - // 14.9.0 - // http://nodejs.cn/api/module.html - 'http://nodejs.cn/api/modules_module.html#modules_module_class_module_sourcemap': - 'http://nodejs.cn/api/module.html#module_class_module_sourcemap', - // 14.9.0 - // 
http://nodejs.cn/api/module.html - 'http://nodejs.cn/api/modules/modules_module.html#modules_module_the_module_object': - 'http://nodejs.cn/api/module.html#module_the_module_object', - 'http://nodejs.cn/api/wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options': - 'https://wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options', - // 16.4.0 - '/api/http_new_agent_options': - 'http://nodejs.cn/api/http.html#http_new_agent_options' -}; - -const redirectCache: Record> = {}; -const redirectFullPathCache: Record> = {}; - -const getRedirectCached = ( - cache: typeof redirectCache, base: Record -) => (options?: StaticDownloadOptions): Record => { - const nodeApiPath = options?.meta?.nodeApiPath as string | undefined; - let result: Record; - if (nodeApiPath) { - result = cache[nodeApiPath]; - if (!result) { - result = {}; - const search = /\/api\//; - const replace = `/${nodeApiPath}/`; - for (const [k, v] of Object.entries(base)) { - result[k.replace(search, replace)] = v.replace(search, replace); - } - cache[nodeApiPath] = result; - } - return result; - } - return base; -}; - -const getRedirect = getRedirectCached(redirectCache, hardCodedRedirect); -const getRedirectFullPath = getRedirectCached( - redirectFullPathCache, hardCodedRedirectFullPath); - const linkRedirectFunc = async ( link: string, elem: Cheerio | null, @@ -231,7 +88,7 @@ const linkRedirectFunc = async ( } } } - const redirectLink = getRedirectFullPath(options)[link]; + const redirectLink = hardCodedRedirectFullPath[link]; if (redirectLink) { link = redirectLink; } @@ -255,7 +112,7 @@ const linkRedirectFunc = async ( u.path(pathArr.join('/')); link = u.toString(); } - const redirect = getRedirect(options); + const redirect = hardCodedRedirect; if (redirect[u.path()]) { u = u.path(redirect[u.path()]); link = u.toString(); @@ -313,11 +170,11 @@ const preProcessResource = ( } }; -const preProcessHtml: ProcessResourceAfterDownloadFunc = ( +const preProcessHtml: 
ProcessResourceAfterDownloadFunc = async ( res: DownloadResource, submit: SubmitResourceFunc, options: StaticDownloadOptions -): DownloadResource => { +): Promise => { if (res.type !== ResourceType.Html) { return res; } @@ -327,13 +184,18 @@ const preProcessHtml: ProcessResourceAfterDownloadFunc = ( const $ = res.meta.doc; const head = $('head'), body = $('body'); // remove comments in body - // note the 'this' hack, nodeType is actually defined - body.contents().filter(function (this: { nodeType: number }) { + body.contents().filter(function (this) { return this.nodeType === 8; }).remove(); + + // decrypt the stuffs behind login wall + await decryptContent($, res.url, options); + $('#biz_nav').remove(); $('#biz_content').remove(); $('#biz_item').remove(); + // login stuff + $('#btn_login,#btn_logout,#wxcode_box').remove(); // remove all scripts $('script').remove(); $('a[href="/"]').remove(); @@ -360,6 +222,12 @@ const preProcessHtml: ProcessResourceAfterDownloadFunc = ( $('`).appendTo(head); + if (api && api !== 'api' && options?.meta?.replaceNodeApiPath) { + const el = $('#alt-docs').parent().parent(); + if (el.is('li.picker-header')) { + el.remove(); + } + } return res; }; @@ -397,7 +265,18 @@ const postProcessSavePath = ( return res; }; +const initNodeApiPathFromOptions = ( + pipeline: PipelineExecutor, downloader?: DownloaderWithMeta +) => { + const options = downloader?.options; + const api = options?.meta?.nodeApiPath; + if (api && typeof api === 'string') { + initNodeApiPath(api); + } +}; + const lifeCycle: ProcessingLifeCycle = defaultLifeCycle(); +lifeCycle.init.push(initNodeApiPathFromOptions); lifeCycle.linkRedirect.push(skipProcess( (link: string) => !link || link.startsWith('https://github.com/'))); lifeCycle.linkRedirect.push(linkRedirectFunc);