-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
node-js: refactor link process and decrypt contents
Recently the source website added login walls, and some of its content has been encrypted since then. This code decrypts and restores the correct content without needing to log in.
- Loading branch information
Showing
5 changed files
with
307 additions
and
165 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import AES from 'crypto-js/aes'; | ||
import Utf8 from 'crypto-js/enc-utf8'; | ||
import got from 'got'; | ||
import {error as errorLogger} from 'website-scrap-engine/lib/logger/logger'; | ||
import type {StaticDownloadOptions} from 'website-scrap-engine/lib/options'; | ||
import type {CheerioStatic} from 'website-scrap-engine/lib/types'; | ||
|
||
// the original decrypt and restore function | ||
// function fn({docs, data, decrypt, enc, loadingDoms}) { | ||
// docs.push(...(data.split('-a0a-').map(doc => { | ||
// return decrypt(doc, 'qQ1').toString(enc.Utf8); | ||
// }))); | ||
// let index = 0; | ||
// for (let loadingDom of loadingDoms) { | ||
// loadingDom.innerHTML = docs[index]; | ||
// index++; | ||
// } | ||
// } | ||
|
||
// marker that joins the individually encrypted fragments in one payload
const separator = '-a0a-';
// passphrase only: crypto-js derives the real key from it using some kdf
const key = 'qQ1';

/**
 * Decrypts a single AES-encrypted fragment with the fixed passphrase
 * and decodes the result as UTF-8 text.
 *
 * @param input one encrypted fragment
 * @returns the decrypted text
 */
const decryptOne = (input: string): string => {
  const plain = AES.decrypt(input, key);
  return plain.toString(Utf8);
};

/**
 * Splits a payload on the separator and decrypts every fragment.
 *
 * @param input the whole payload
 * @returns the decrypted fragments, in payload order
 */
const decrypt = (input: string): string[] => {
  const fragments = input.split(separator);
  return fragments.map((fragment) => decryptOne(fragment));
};
|
||
// page urls are expected to end with this extension
const regexp = /\.html$/;

// the resource behind this extension is base64 encoded text,
// not a real ttf font
const replaceAs = '.ttf';

/**
 * Downloads the encrypted companion resource of a page and decrypts it.
 *
 * The companion url is the page url with its trailing `.html`
 * replaced by `.ttf`.
 *
 * @param url the page url, must end with `.html`
 * @param options download options; `options.req` extends the got instance
 * @returns the decrypted fragments, or nothing when the url does not
 *   end with `.html` or the response body is empty (both are logged)
 */
const fetchAndDecrypt = async (
  url: string,
  options: StaticDownloadOptions
): Promise<string[] | void> => {
  if (!regexp.test(url)) {
    errorLogger.error('fetchAndDecrypt: bad url', url);
    return;
  }
  const realContentUrl = url.replace(regexp, replaceAs);

  const client = options?.req ? got.extend(options.req) : got;
  const resp = await client(realContentUrl);

  if (resp.body?.length) {
    return decrypt(resp.body);
  }
  errorLogger.error('fetchAndDecrypt: empty body', realContentUrl, resp);
  return;
};
|
||
// pending and fulfilled fetchAndDecrypt promises, keyed by page url
const asyncCache: Record<string, Promise<string[] | void>> = {};

/**
 * Memoizing wrapper around fetchAndDecrypt.
 *
 * Concurrent and repeated calls for the same url share one request.
 * A rejected request is removed from the cache, so a transient failure
 * does not make every later call for that url fail without retrying.
 *
 * @param url the page url
 * @param options download options, forwarded to fetchAndDecrypt
 * @returns the decrypted fragments, or nothing when none were found
 */
const cachedFetchAndDecrypt = async (
  url: string,
  options: StaticDownloadOptions
): Promise<string[] | void> => {
  const known = asyncCache[url];
  if (known !== undefined) {
    return known;
  }
  const pending = fetchAndDecrypt(url, options);
  asyncCache[url] = pending;
  // the caller still receives the rejection through the returned promise;
  // this handler only evicts the failed entry so the next call can retry
  pending.catch(() => {
    if (asyncCache[url] === pending) {
      delete asyncCache[url];
    }
  });
  return pending;
};
|
||
/**
 * Fills the `.loading` placeholders under `#apicontent` with the
 * decrypted content fetched for the page.
 *
 * Placeholders and decrypted fragments are paired by index. Failures
 * are logged and leave the document untouched.
 *
 * @param $ the parsed page
 * @param url the page url
 * @param options download options, forwarded to the fetcher
 */
export async function decryptContent(
  $: CheerioStatic, url: string, options: StaticDownloadOptions
): Promise<void> {
  // selector and variable name are the ones used by the site's own script
  const loadingDoms = $('#apicontent .loading');
  const placeholderCount = loadingDoms.length;
  if (!placeholderCount) {
    return;
  }
  // variable name is the one used by the site's own script
  let docs: string[] | void;
  try {
    docs = await cachedFetchAndDecrypt(url, options);
  } catch (e) {
    errorLogger.error('cachedFetchAndDecrypt', url, e);
    return;
  }
  if (!docs) {
    errorLogger.warn('no content found', url);
    return;
  }
  if (placeholderCount !== docs.length) {
    errorLogger.warn('length mismatch',
      url, placeholderCount, docs.length);
  }
  // surplus fragments are ignored, placeholders without a fragment
  // (or with an empty one) stay as they are
  docs.slice(0, placeholderCount).forEach((decrypted, index) => {
    if (decrypted) {
      loadingDoms.eq(index).removeClass('loading').html(decrypted);
    }
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
import got from 'got'; | ||
import type {StaticDownloadOptions} from 'website-scrap-engine/lib/options'; | ||
import {error as errorLogger} from 'website-scrap-engine/lib/logger/logger'; | ||
|
||
// markers of the array-based redirect page: a `var arr = [...]` list
// of targets followed by `location.replace(arr[<index>])`
const KW_ARR_BEGIN = 'var arr = [';
const KW_ARR_END = '];';
const KW_ARR_INDEX_BEGIN = 'location.replace(arr[';

// origin of the source website
const HOST = 'nodejs.cn';
const PROTOCOL = 'http';
const URL_PREFIX = PROTOCOL + '://' + HOST;

// markers of the redirect page that passes the target as a string literal
const LOCATION_REPLACE_LITERAL = 'location.replace(\'';
const LOCATION_REPLACE_LITERAL_END = '\')';
|
||
|
||
// default client: redirects must be observed here, never followed
const gotNoRedirect = got.extend({followRedirect: false});

// resolved redirect targets keyed by the original link
export const cache: Record<string, string> = {};
// getRedirectLocation promises keyed by the original link
const asyncRedirectCache: Record<string, Promise<string>> = {};

/**
 * Extracts the redirect target from the html of a redirect page.
 *
 * Two page formats are recognised: the array-based one (target list plus
 * an index) and the one used since 2021, which passes the target as a
 * string literal. Only targets of the array-based format are stored in
 * `cache`.
 *
 * @param link the link being resolved, used as cache key and for logging
 * @param html body of the redirect page
 * @returns the redirect target, or `link` when none could be extracted
 */
const resolveRedirectFromHtml = (link: string, html: string): string => {
  const arrBegin = html.indexOf(KW_ARR_BEGIN);
  const arrEnd = html.indexOf(KW_ARR_END, arrBegin);
  const arrIndex = html.indexOf(KW_ARR_INDEX_BEGIN, arrEnd);

  if (arrBegin <= 0 || arrEnd <= 0 || arrIndex <= 0) {
    // the new redirect page since 2021
    const literalBegin = html.indexOf(LOCATION_REPLACE_LITERAL);
    const literalEnd = literalBegin > 0
      ? html.indexOf(LOCATION_REPLACE_LITERAL_END, literalBegin)
      : -1;
    if (literalBegin <= 0 || literalEnd <= 0) {
      errorLogger.warn('Unknown redirect result format', link, html);
      return link;
    }
    return html.slice(
      literalBegin + LOCATION_REPLACE_LITERAL.length, literalEnd);
  }

  try {
    // the slice keeps the surrounding brackets so it parses as a json array
    const arr = JSON.parse(html.slice(
      arrBegin + KW_ARR_BEGIN.length - 1, arrEnd + 1));
    const i = parseInt(html.slice(
      arrIndex + KW_ARR_INDEX_BEGIN.length), 10);
    if (!arr || isNaN(i) || !arr[i]) {
      errorLogger.warn('Can not parse redirect for', link, arr, i);
      return link;
    }
    cache[link] = arr[i];
    return arr[i];
  } catch (e) {
    errorLogger.error('Error resolving redirect result', link, html, e);
    return link;
  }
};

/**
 * Resolves where a link redirects to without following the redirect.
 *
 * A 302 response with a location header wins; otherwise the response
 * body is searched for a script-based redirect. Finally the `/api/`
 * directory of the result is replaced with `options.meta.nodeApiPath`
 * when that is set.
 *
 * @param link the link to resolve; links starting with `/s` are
 *   requested relative to the site origin
 * @param options download options; `options.req` extends the got instance
 * @returns the resolved link, or the input link when nothing was found
 */
const getRedirectLocation = async (
  link: string,
  options: StaticDownloadOptions
): Promise<string> => {
  // followRedirect is passed last so that options.req cannot enable it
  const client = options?.req
    ? got.extend(options.req, {followRedirect: false})
    : gotNoRedirect;
  const requestUrl = link.startsWith('/s') ? URL_PREFIX + link : link;
  const redirect = await client(requestUrl);

  let target = link;
  if (redirect.statusCode === 302 && redirect.headers?.location) {
    target = redirect.headers.location;
    cache[link] = target;
  } else if (redirect.body) {
    target = resolveRedirectFromHtml(link, redirect.body);
  }

  // replace the api to required version
  const nodeApiPath = options?.meta?.nodeApiPath;
  if (nodeApiPath) {
    target = target.replace(`${URL_PREFIX}/api/`,
      `${URL_PREFIX}/${nodeApiPath}/`);
  }
  return target;
};
|
||
/**
 * Cached front end of getRedirectLocation.
 *
 * Returns the entry of `cache` synchronously when there is one, otherwise
 * the shared promise for the link. A rejected lookup is removed from the
 * promise cache, so a transient failure does not make every later lookup
 * of the same link fail without retrying.
 *
 * NOTE(review): entries of `cache` are stored by getRedirectLocation
 * before its `nodeApiPath` rewrite, while the promise resolves to the
 * rewritten link, so a cache hit and a cache miss can return different
 * values for the same link. Confirm whether callers rely on this.
 *
 * @param link the link to resolve
 * @param options download options, forwarded to getRedirectLocation
 * @returns the cached target, or a promise of the resolved link
 */
export const cachedGetRedirectLocation = (
  link: string, options: StaticDownloadOptions
): string | Promise<string> => {
  const resolved = cache[link];
  if (resolved) {
    return resolved;
  }
  const known = asyncRedirectCache[link];
  if (known !== undefined) {
    return known;
  }
  const pending = getRedirectLocation(link, options);
  asyncRedirectCache[link] = pending;
  // the caller still receives the rejection through the returned promise;
  // this handler only evicts the failed entry so the next call can retry
  pending.catch(() => {
    if (asyncRedirectCache[link] === pending) {
      delete asyncRedirectCache[link];
    }
  });
  return pending;
};
|
||
/**
 * Builds the path replacements for links that are 404-not-found
 * on the site.
 *
 * @param api name of the api directory, e.g. `api` or `api-v16`
 * @returns a map from the broken path to the working path,
 *   both prefixed with `/${api}/`
 */
const hardCodedRedirectBuilder = (api: string): Record<string, string> => {
  // [broken, working], relative to the api directory
  const pairs: ReadonlyArray<readonly [string, string]> = [
    ['stream.md', 'stream.html'],
    ['http/net.html', 'net.html'],
    ['fs/stream.html', 'stream.html'],
    ['addons/n-api.html', 'n-api.html'],
    ['assert/tty.html', 'tty.html'],
    ['worker_threads/errors.html', 'errors.html'],
    ['process/cli.html', 'cli.html'],
    ['zlib/buffer.html', 'buffer.html'],
    ['dgram/errors.html', 'errors.html'],
    ['net/stream.html', 'stream.html'],
    ['process/stream.html', 'stream.html'],
    ['worker_threads/fs.html', 'fs.html'],
    // 14.12.0
    ['synopsis/cli.html', 'cli.html'],
    // since 16.3.0
    ['modules/esm.md', 'esm.html'],
    // since 18.4.0 maybe for api-v14 and api-v16
    ['esm.md', 'esm.html'],
  ];
  const redirects: Record<string, string> = {};
  pairs.forEach(([broken, working]) => {
    redirects[`/${api}/${broken}`] = `/${api}/${working}`;
  });
  return redirects;
};
|
||
/**
 * Builds the replacements that are keyed by a whole link, fragment
 * included, rather than by path alone.
 *
 * @param api name of the api directory, e.g. `api` or `api-v16`
 * @returns a map from the broken link to the working link
 */
const hardCodedRedirectFullPathBuilder = (api: string): Record<string, string> => {
  const base = `http://${HOST}/${api}`;
  const redirects: Record<string, string> = {};

  // 14.9.0
  // http://nodejs.cn/api/module.html
  redirects[`${base}/modules_cjs.html#modules_cjs_the_module_wrapper`] =
    `${base}/modules.html#modules_the_module_wrapper`;
  // 14.9.0
  // http://nodejs.cn/api/module.html
  redirects[`${base}/modules_module.html#modules_module_class_module_sourcemap`] =
    `${base}/module.html#module_class_module_sourcemap`;
  // 14.9.0
  // http://nodejs.cn/api/module.html
  redirects[`${base}/modules/modules_module.html#modules_module_the_module_object`] =
    `${base}/module.html#module_the_module_object`;
  // an external link that ended up under the api directory
  redirects[`${base}/wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options`] =
    'https://wiki.openssl.org/index.php/List_of_SSL_OP_Flags#Table_of_Options';
  // 16.4.0
  redirects[`/${api}/http_new_agent_options`] =
    `${base}/http.html#http_new_agent_options`;

  return redirects;
};
|
||
// path replacements for the api directories known in advance;
// initNodeApiPath adds further directories at runtime
export const hardCodedRedirect: Record<string, string> = Object.assign(
  {},
  hardCodedRedirectBuilder('api'),
  hardCodedRedirectBuilder('api-v16'),
  hardCodedRedirectBuilder('api-v14')
);
|
||
// whole-link replacements for the api directories known in advance;
// initNodeApiPath adds further directories at runtime
export const hardCodedRedirectFullPath: Record<string, string> = Object.assign(
  {},
  hardCodedRedirectFullPathBuilder('api'),
  hardCodedRedirectFullPathBuilder('api-v16'),
  hardCodedRedirectFullPathBuilder('api-v14')
);
|
||
/**
 * Registers the hard coded replacements of one more api directory
 * in both exported replacement maps.
 *
 * @param api name of the api directory to add
 */
export const initNodeApiPath = (api: string): void => {
  const pathRedirects = hardCodedRedirectBuilder(api);
  Object.assign(hardCodedRedirect, pathRedirects);

  const fullPathRedirects = hardCodedRedirectFullPathBuilder(api);
  Object.assign(hardCodedRedirectFullPath, fullPathRedirects);
};
Oops, something went wrong.