Skip to content

Commit

Permalink
node-js: refactor link process and decrypt contents
Browse files Browse the repository at this point in the history
Recently the source website added login walls, and some content has been encrypted since then.
This code decrypts and restores the correct content without needing to log in.
  • Loading branch information
myfreeer committed Jul 16, 2022
1 parent de72d4f commit 2176bfd
Show file tree
Hide file tree
Showing 5 changed files with 307 additions and 165 deletions.
12 changes: 11 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"node": ">=12.16.0"
},
"dependencies": {
"@types/crypto-js": "^4.1.1",
"agentkeepalive": "^4.2.1",
"crypto-js": "^4.1.1",
"mkdirp": "^1.0.4",
"tough-cookie": "^4.0.0",
"urijs": "^1.19.11",
Expand Down
100 changes: 100 additions & 0 deletions src/nodejs-cn/decrypt-contents.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import AES from 'crypto-js/aes';
import Utf8 from 'crypto-js/enc-utf8';
import got from 'got';
import {error as errorLogger} from 'website-scrap-engine/lib/logger/logger';
import type {StaticDownloadOptions} from 'website-scrap-engine/lib/options';
import type {CheerioStatic} from 'website-scrap-engine/lib/types';

// the original decrypt and restore function
// function fn({docs, data, decrypt, enc, loadingDoms}) {
// docs.push(...(data.split('-a0a-').map(doc => {
// return decrypt(doc, 'qQ1').toString(enc.Utf8);
// })));
// let index = 0;
// for (let loadingDom of loadingDoms) {
// loadingDom.innerHTML = docs[index];
// index++;
// }
// }

// delimiter placed between the encrypted documents of one payload
const separator = '-a0a-';
// passphrase given to crypto-js;
// the real key is computed inside crypto-js using some kdf
const key = 'qQ1';

/**
 * Decrypts one encrypted chunk with the shared passphrase
 * and renders the result as a UTF-8 string.
 */
const decryptOne = (input: string): string => {
  const plain = AES.decrypt(input, key);
  return plain.toString(Utf8);
};

/**
 * Splits a payload on the separator and decrypts every chunk,
 * keeping the chunks in their original order.
 */
const decrypt = (input: string): string[] => {
  const chunks = input.split(separator);
  return chunks.map((chunk) => decryptOne(chunk));
};

// matches a trailing ".html" extension
const regexp = /\.html$/;

// note this is base64 encoded text, not real ttf font
const replaceAs = '.ttf';

/**
 * Downloads the encrypted companion resource of a page and decrypts it.
 *
 * The companion url is the page url with its trailing `.html` swapped
 * for `.ttf`.
 *
 * @param url the page url, expected to end with `.html`
 * @param options download options; `options.req`, when present,
 *   is used to extend the got instance that performs the request
 * @returns the decrypted documents, or nothing when the url does not end
 *   with `.html` or the response body is empty (both cases are logged)
 */
const fetchAndDecrypt = async (
  url: string,
  options: StaticDownloadOptions
): Promise<string[] | void> => {
  const contentUrl = url.replace(regexp, replaceAs);
  if (contentUrl === url) {
    // nothing was replaced, so the url had no ".html" suffix
    errorLogger.error('fetchAndDecrypt: bad url', url);
    return;
  }

  let client = got;
  if (options?.req) {
    client = got.extend(options.req);
  }

  const response = await client(contentUrl);
  const {body} = response;
  if (body?.length) {
    return decrypt(body);
  }
  errorLogger.error('fetchAndDecrypt: empty body', contentUrl, response);
  return;
};

// fetches keyed by page url, pending or settled, so that the encrypted
// payload of a page is requested only once for as long as it succeeds
const asyncCache: Record<string, Promise<string[] | void>> = {};

/**
 * Memoizing wrapper around fetchAndDecrypt.
 *
 * Concurrent and repeated calls for the same url share one request.
 * A request that rejects is removed from the cache, so a later call
 * can retry instead of replaying the old failure forever.
 *
 * @param url the page url, used as the cache key
 * @param options download options forwarded to fetchAndDecrypt
 * @returns whatever fetchAndDecrypt resolves to for this url
 */
const cachedFetchAndDecrypt = async (
  url: string,
  options: StaticDownloadOptions
): Promise<string[] | void> => {
  const cached = asyncCache[url];
  if (cached !== undefined) {
    return cached;
  }
  const pending = fetchAndDecrypt(url, options);
  asyncCache[url] = pending;
  // Evict on failure. The promise derived by catch() is fulfilled by the
  // handler, so it never becomes an unhandled rejection; the caller still
  // observes the original rejection through `pending`.
  pending.catch(() => {
    if (asyncCache[url] === pending) {
      delete asyncCache[url];
    }
  });
  return pending;
};

/**
 * Replaces the loading placeholders of a page with their decrypted content.
 *
 * Placeholders are the elements matching `#apicontent .loading`. The n-th
 * placeholder receives the n-th decrypted document as its inner html and
 * loses its `loading` class. Placeholders without a non-empty document are
 * left untouched. Fetch errors, missing content and count mismatches are
 * logged; this function does not rethrow fetch errors.
 *
 * @param $ the parsed page
 * @param url the page url the content belongs to
 * @param options download options forwarded to the fetch
 */
export async function decryptContent(
  $: CheerioStatic, url: string, options: StaticDownloadOptions
): Promise<void> {
  // same selector as the original script of the site
  const placeholders = $('#apicontent .loading');
  if (!placeholders.length) {
    // nothing to fill in, so no request is made
    return;
  }

  let contents: string[] | void;
  try {
    contents = await cachedFetchAndDecrypt(url, options);
  } catch (e) {
    errorLogger.error('cachedFetchAndDecrypt', url, e);
    return;
  }

  if (!contents) {
    errorLogger.warn('no content found', url);
    return;
  }

  if (placeholders.length !== contents.length) {
    // keep going: fill as many placeholders as there are documents
    errorLogger.warn('length mismatch',
      url, placeholders.length, contents.length);
  }

  for (let index = 0; index < placeholders.length; index++) {
    const html = contents[index];
    if (!html) {
      continue;
    }
    placeholders.eq(index).removeClass('loading').html(html);
  }
}
151 changes: 151 additions & 0 deletions src/nodejs-cn/fix-link.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import got from 'got';
import type {StaticDownloadOptions} from 'website-scrap-engine/lib/options';
import {error as errorLogger} from 'website-scrap-engine/lib/logger/logger';

// markers of the array-based redirect page
const KW_ARR_BEGIN = 'var arr = [';
const KW_ARR_END = '];';
const KW_ARR_INDEX_BEGIN = 'location.replace(arr[';

// origin of the source website
const HOST = 'nodejs.cn';
const PROTOCOL = 'http';
const URL_PREFIX = PROTOCOL + '://' + HOST;

// markers of the redirect page that passes a string literal
const LOCATION_REPLACE_LITERAL = 'location.replace(\'';
const LOCATION_REPLACE_LITERAL_END = '\')';


// got instance that does not follow redirects
const gotNoRedirect = got.extend({followRedirect: false});

// resolved redirect targets, keyed by the link that was requested
export const cache: Record<string, string> = {};
// redirect lookups keyed by the requested link, shared between callers
const asyncRedirectCache: Record<string, Promise<string>> = {};

/**
 * Resolves the real target of a redirect link by requesting it once with
 * redirect following disabled and inspecting the response.
 *
 * Three response shapes are recognized, in this order:
 * 1. status 302 with a `location` header;
 * 2. a body containing `var arr = [...]` followed by
 *    `location.replace(arr[N`, where the target is element N of the array;
 * 3. a body containing `location.replace('...')` with a literal target
 *    (the new redirect page since 2021).
 *
 * If `options.meta.nodeApiPath` is set, the `/api/` segment directly after
 * URL_PREFIX is rewritten to that path. A resolved target is stored in
 * `cache` in this final, rewritten form, so a later synchronous cache hit
 * yields the same value this function returned.
 *
 * @param link the link to resolve; links starting with `/s` are requested
 *   relative to URL_PREFIX
 * @param options download options; `options.req`, when present, is used
 *   to extend the got instance that performs the request
 * @returns the resolved target, or the given link (with the same version
 *   rewrite applied) when no known redirect shape was found. Errors from
 *   the request itself are not caught here and propagate to the caller.
 */
const getRedirectLocation = async (
  link: string,
  options: StaticDownloadOptions
): Promise<string> => {
  // make sure that followRedirect is false here
  const theGot = options?.req ? got.extend(options.req, {
    followRedirect: false
  }) : gotNoRedirect;
  const redirect = await theGot(
    link.startsWith('/s') ? URL_PREFIX + link : link);

  // stays undefined unless one of the known shapes yields a target
  let resolved: string | undefined;
  if (redirect.statusCode === 302 && redirect.headers?.location) {
    resolved = redirect.headers.location;
  } else if (redirect.body) {
    const html = redirect.body;
    // indexOf reports a miss as -1; index 0 is a valid match position
    const arrBegin = html.indexOf(KW_ARR_BEGIN);
    const arrEnd = arrBegin === -1 ?
      -1 : html.indexOf(KW_ARR_END, arrBegin);
    const arrIndex = arrEnd === -1 ?
      -1 : html.indexOf(KW_ARR_INDEX_BEGIN, arrEnd);
    if (arrIndex !== -1) {
      try {
        // keep both brackets so that the slice is a JSON array
        const arr: unknown = JSON.parse(html.slice(
          arrBegin + KW_ARR_BEGIN.length - 1, arrEnd + 1));
        const i = parseInt(html.slice(
          arrIndex + KW_ARR_INDEX_BEGIN.length), 10);
        const candidate: unknown =
          Array.isArray(arr) && !isNaN(i) ? arr[i] : undefined;
        // only a non-empty string can be used as a link
        if (typeof candidate === 'string' && candidate) {
          resolved = candidate;
        } else {
          errorLogger.warn('Can not parse redirect for', link, arr, i);
        }
      } catch (e) {
        errorLogger.error('Error resolving redirect result', link, html, e);
      }
    } else {
      // the new redirect page since 2021
      const literalBegin = html.indexOf(LOCATION_REPLACE_LITERAL);
      const targetBegin = literalBegin + LOCATION_REPLACE_LITERAL.length;
      const literalEnd = literalBegin === -1 ?
        -1 : html.indexOf(LOCATION_REPLACE_LITERAL_END, targetBegin);
      if (literalEnd !== -1) {
        resolved = html.slice(targetBegin, literalEnd);
      } else {
        errorLogger.warn('Unknown redirect result format', link, html);
      }
    }
  }

  let result = resolved ?? link;
  // replace the api to required version
  if (options?.meta?.nodeApiPath) {
    result = result.replace(`${URL_PREFIX}/api/`,
      `${URL_PREFIX}/${options.meta.nodeApiPath}/`);
  }
  if (resolved !== undefined) {
    // cache the final value, after the version rewrite
    cache[link] = result;
  }
  return result;
};

/**
 * Looks up the redirect target of a link, starting a request only when
 * the link has not been seen before.
 *
 * @param link the link to resolve, used as the cache key
 * @param options download options forwarded to getRedirectLocation
 * @returns the string recorded in `cache` when there is a non-empty one,
 *   otherwise the promise shared by every caller asking for this link
 */
export const cachedGetRedirectLocation = (
  link: string, options: StaticDownloadOptions
): string | Promise<string> => {
  const known = cache[link];
  if (known) {
    return known;
  }
  const shared = asyncRedirectCache[link];
  if (shared !== undefined) {
    return shared;
  }
  const pending = getRedirectLocation(link, options);
  asyncRedirectCache[link] = pending;
  return pending;
};

// the 404-not-found links
/**
 * Builds the table of path redirects for one api directory.
 *
 * @param api the api directory name without slashes, like `api-v16`
 * @returns a map from broken path to working path, both under `/${api}`
 */
const hardCodedRedirectBuilder = (api: string): Record<string, string> => {
  const root = '/' + api;
  const redirects: Record<string, string> = {};
  // both sides of every entry live under the same api directory
  const add = (from: string, to: string): void => {
    redirects[root + from] = root + to;
  };

  add('/stream.md', '/stream.html');
  add('/http/net.html', '/net.html');
  add('/fs/stream.html', '/stream.html');
  add('/addons/n-api.html', '/n-api.html');
  add('/assert/tty.html', '/tty.html');
  add('/worker_threads/errors.html', '/errors.html');
  add('/process/cli.html', '/cli.html');
  add('/zlib/buffer.html', '/buffer.html');
  add('/dgram/errors.html', '/errors.html');
  add('/net/stream.html', '/stream.html');
  add('/process/stream.html', '/stream.html');
  add('/worker_threads/fs.html', '/fs.html');
  // 14.12.0
  add('/synopsis/cli.html', '/cli.html');
  // since 16.3.0
  add('/modules/esm.md', '/esm.html');
  // since 18.4.0 maybe for api-v14 and api-v16
  add('/esm.md', '/esm.html');

  return redirects;
};

/**
 * Builds the table of redirects keyed by whole link for one api directory.
 *
 * @param api the api directory name without slashes, like `api-v16`
 * @returns a map from broken link to working link
 */
const hardCodedRedirectFullPathBuilder = (api: string): Record<string, string> => {
  // common prefix of the absolute urls below
  const base = `http://${HOST}/${api}`;
  return {
    // 14.9.0
    // http://nodejs.cn/api/module.html
    [`${base}/modules_cjs.html#modules_cjs_the_module_wrapper`]:
      `${base}/modules.html#modules_the_module_wrapper`,
    // 14.9.0
    // http://nodejs.cn/api/module.html
    [`${base}/modules_module.html#modules_module_class_module_sourcemap`]:
      `${base}/module.html#module_class_module_sourcemap`,
    // 14.9.0
    // http://nodejs.cn/api/module.html
    [`${base}/modules/modules_module.html#modules_module_the_module_object`]:
      `${base}/module.html#module_the_module_object`,
    [`${base}/wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options`]:
      'https://wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options',
    // 16.4.0
    // this key is a bare path, not an absolute url
    [`/${api}/http_new_agent_options`]:
      `${base}/http.html#http_new_agent_options`
  };
};

// path redirects of the api directories known in advance,
// merged in the order api, api-v16, api-v14
export const hardCodedRedirect: Record<string, string> = Object.assign(
  {},
  hardCodedRedirectBuilder('api'),
  hardCodedRedirectBuilder('api-v16'),
  hardCodedRedirectBuilder('api-v14')
);

// whole-link redirects of the api directories known in advance,
// merged in the order api, api-v16, api-v14
export const hardCodedRedirectFullPath: Record<string, string> = Object.assign(
  {},
  hardCodedRedirectFullPathBuilder('api'),
  hardCodedRedirectFullPathBuilder('api-v16'),
  hardCodedRedirectFullPathBuilder('api-v14')
);

/**
 * Adds the redirects of one more api directory to both exported tables.
 * Existing entries with the same key are overwritten.
 *
 * @param api the api directory name without slashes, like `api-v16`
 */
export const initNodeApiPath = (api: string): void => {
  const byPath = hardCodedRedirectBuilder(api);
  const byFullPath = hardCodedRedirectFullPathBuilder(api);
  Object.assign(hardCodedRedirect, byPath);
  Object.assign(hardCodedRedirectFullPath, byFullPath);
};
Loading

0 comments on commit 2176bfd

Please sign in to comment.