From fa4d8ddc5051095d76f84862bf3ab60a22c6dd0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E6=99=AF=E9=98=B3?= <1656488874@qq.com> Date: Tue, 26 Nov 2019 15:26:01 +0800 Subject: [PATCH 01/21] =?UTF-8?q?fix=20=E6=89=A7=E8=A1=8C=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/model/task.go | 1 + backend/services/spider.go | 3 +-- backend/services/task.go | 18 +++++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/backend/model/task.go b/backend/model/task.go index 64f06cd71..b869b506e 100644 --- a/backend/model/task.go +++ b/backend/model/task.go @@ -61,6 +61,7 @@ func (t *Task) Save() error { defer s.Close() t.UpdateTs = time.Now() if err := c.UpdateId(t.Id, t); err != nil { + log.Errorf("update task error: %s", err.Error()) debug.PrintStack() return err } diff --git a/backend/services/spider.go b/backend/services/spider.go index 84d218bbd..d76f1f9ab 100644 --- a/backend/services/spider.go +++ b/backend/services/spider.go @@ -116,10 +116,9 @@ func PublishAllSpiders() { // 发布爬虫 func PublishSpider(spider model.Spider) { - // 查询gf file,不存在则删除 + // 可能爬虫文件不存在,则直接返回 gfFile := model.GetGridFs(spider.FileId) if gfFile == nil { - _ = model.RemoveSpider(spider.Id) return } spiderSync := spider_handler.SpiderSync{ diff --git a/backend/services/task.go b/backend/services/task.go index 67c3396b1..3cb1a12ca 100644 --- a/backend/services/task.go +++ b/backend/services/task.go @@ -391,15 +391,23 @@ func ExecuteTask(id int) { t.Status = constants.StatusRunning // 任务状态 t.WaitDuration = t.StartTs.Sub(t.CreateTs).Seconds() // 等待时长 + // 判断爬虫文件是否存在 + gfFile := model.GetGridFs(spider.FileId) + if gfFile == nil { + t.Error = "找不到爬虫文件,请重新上传" + t.Status = constants.StatusError + t.FinishTs = time.Now() // 结束时间 + t.RuntimeDuration = t.FinishTs.Sub(t.StartTs).Seconds() // 运行时长 + t.TotalDuration = t.FinishTs.Sub(t.CreateTs).Seconds() // 总时长 + _ = t.Save() + return + } + // 开始执行任务 log.Infof(GetWorkerPrefix(id) + "开始执行任务(ID:" + t.Id + ")") // 储存任务 - if err := t.Save(); err != nil { - log.Errorf(err.Error()) - HandleTaskError(t, err) - return - } + _ = t.Save() // 起一个cron执行器来统计任务结果数 if spider.Col != "" { From d5cfb045605d165a01240e1e56790e7a1bf544c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E6=99=AF=E9=98=B3?= <1656488874@qq.com> Date: Sat, 7 Dec 2019 10:58:09 +0800 Subject: [PATCH 02/21] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/store/modules/spider.js | 1 - frontend/src/views/schedule/ScheduleList.vue | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js index f4d7b1349..7ff9324a4 100644 --- a/frontend/src/store/modules/spider.js +++ b/frontend/src/store/modules/spider.js @@ -1,6 +1,5 @@ import Vue from 'vue' import request from '../../api/request' -import axisModelCommonMixin from 'echarts/src/coord/axisModelCommonMixin' const state = { // list of spiders diff --git a/frontend/src/views/schedule/ScheduleList.vue b/frontend/src/views/schedule/ScheduleList.vue index 3a032b230..44a525173 100644 --- a/frontend/src/views/schedule/ScheduleList.vue +++ b/frontend/src/views/schedule/ScheduleList.vue @@ -115,9 +115,11 @@ @@ -162,6 +168,7 @@ export default { columns: [ { name: 'name', label: 'Name', width: '180' }, { name: 'cron', label: 'Cron', width: '120' }, + { name: 'run_type', label: 'Run Type', width: '150' }, { name: 'node_name', label: 'Node', width: '150' }, { name: 'spider_name', label: 'Spider', width: '150' }, { name: 'param', label: 'Parameters', width: '150' }, @@ -204,7 +211,7 @@ export default { onAdd () { this.isEdit = false this.dialogVisible = true - this.$store.commit('schedule/SET_SCHEDULE_FORM', {}) + this.$store.commit('schedule/SET_SCHEDULE_FORM', { node_ids: [] }) this.$st.sendEv('定时任务', '添加') }, onAddSubmit () { @@ -308,6 +315,15 @@ export default { } else { return false } + }, + getStatusTooltip (row) { + if (row.status === 'stop') { + return 'Start' + } else if (row.status === 'running') { + return 'Stop' + } else if (row.status === 'error') { + return 'Start' + } } }, created () { From be9598abbc2b0740f1b408f3cfc8da3ff812a9f4 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Thu, 12 Dec 2019 13:47:43 +0800 Subject: [PATCH 14/21] updated CHANGELOG.md --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95ef9cd76..93d315a2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +# 0.4.0 (2019-12-06) +### Features / Enhancement +- **Configurable Spider**. Allow users to add spiders using *Spiderfile* to configure crawling rules. +- **Execution Mode**. Allow users to select 3 modes for task execution: *All Nodes*, *Selected Nodes* and *Random*. + +### Bug Fixes +- **Task accidentally killed**. [#306](https://github.com/crawlab-team/crawlab/issues/306) +- **Documentation fix**. [#301](https://github.com/crawlab-team/crawlab/issues/258) [#301](https://github.com/crawlab-team/crawlab/issues/258) +- **Direct deploy incompatible with Windows**. [#288](https://github.com/crawlab-team/crawlab/issues/288) +- **Log files lost**. [#269](https://github.com/crawlab-team/crawlab/issues/269) + # 0.3.5 (2019-10-28) ### Features / Enhancement - **Graceful Showdown**. [detail](https://github.com/crawlab-team/crawlab/commit/63fab3917b5a29fd9770f9f51f1572b9f0420385) From a067c1c1adfa4e2c476509894a34f92316b23ebd Mon Sep 17 00:00:00 2001 From: marvzhang Date: Fri, 13 Dec 2019 12:55:53 +0800 Subject: [PATCH 15/21] =?UTF-8?q?=E5=B0=86=E5=8F=AF=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E7=88=AC=E8=99=ABstages=E8=B0=83=E6=95=B4=E4=B8=BA=E5=88=97?= =?UTF-8?q?=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/entity/config_spider.go | 2 +- backend/model/config_spider/common.go | 10 ++--- backend/model/config_spider/scrapy.go | 3 +- backend/model/spider.go | 6 --- backend/services/config_spider.go | 10 ++--- .../template/spiderfile/Spiderfile.163_news | 22 +++++------ backend/template/spiderfile/Spiderfile.baidu | 26 ++++++------- .../spiderfile/Spiderfile.toscrapy_books | 38 +++++++++---------- 8 files changed, 52 insertions(+), 65 deletions(-) diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go index 3fe28bc95..d9e085d2e 100644 --- a/backend/entity/config_spider.go +++ b/backend/entity/config_spider.go @@ -5,7 +5,7 @@ type ConfigSpiderData struct { Engine string `yaml:"engine" json:"engine"` StartUrl string `yaml:"start_url" json:"start_url"` StartStage string `yaml:"start_stage" json:"start_stage"` - Stages map[string]Stage `yaml:"stages" json:"stages"` + Stages []Stage `yaml:"stages" json:"stages"` Settings map[string]string `yaml:"settings" json:"settings"` } diff --git a/backend/model/config_spider/common.go b/backend/model/config_spider/common.go index c803755ac..4d244fe19 100644 --- a/backend/model/config_spider/common.go +++ b/backend/model/config_spider/common.go @@ -15,16 +15,12 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field { func GetStartStageName(data entity.ConfigSpiderData) string { // 如果 start_stage 设置了且在 stages 里,则返回 if data.StartStage != "" { - for stageName := range data.Stages { - if stageName == data.StartStage { - return data.StartStage - } - } + return data.StartStage } // 否则返回第一个 stage - for stageName := range data.Stages { - return stageName + for _, stage := range data.Stages { + return stage.Name } return "" } diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go index 6fcb77f02..ee24a3e78 100644 --- a/backend/model/config_spider/scrapy.go +++ b/backend/model/config_spider/scrapy.go @@ -83,7 +83,8 @@ func (g ScrapyGenerator) ProcessSpider() error { // 替换 parsers strParser := "" - for stageName, stage := range g.ConfigData.Stages { + for _, stage := range g.ConfigData.Stages { + stageName := stage.Name stageStr := g.GetParserString(stageName, stage) strParser += stageStr } diff --git a/backend/model/spider.go b/backend/model/spider.go index a0d72c1cd..78adc4d0c 100644 --- a/backend/model/spider.go +++ b/backend/model/spider.go @@ -319,11 +319,5 @@ func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) { return configData, err } - // 赋值 stage_name - for stageName, stage := range configData.Stages { - stage.Name = stageName - configData.Stages[stageName] = stage - } - return configData, nil } diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index 7c736cc71..fe0a3da14 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -61,7 +61,9 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { // 校验stages dict := map[string]int{} - for stageName, stage := range configData.Stages { + for _, stage := range configData.Stages { + stageName := stage.Name + // stage 名称不能为空 if stageName == "" { return errors.New("spiderfile invalid: stage name is empty") @@ -152,12 +154,6 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool { func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error { spiderDir := spider.Src - // 赋值 stage_name - for stageName, stage := range configData.Stages { - stage.Name = stageName - configData.Stages[stageName] = stage - } - // 删除已有的爬虫文件 for _, fInfo := range utils.ListDir(spiderDir) { // 不删除Spiderfile diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news index 29d58279a..c2a73be7c 100644 --- a/backend/template/spiderfile/Spiderfile.163_news +++ b/backend/template/spiderfile/Spiderfile.163_news @@ -4,17 +4,17 @@ start_url: "http://news.163.com/special/0001386F/rank_news.html" start_stage: "list" engine: "scrapy" stages: - list: - is_list: true - list_css: "table tr:not(:first-child)" - fields: - - name: "title" - css: "td:nth-child(1) > a" - - name: "url" - css: "td:nth-child(1) > a" - attr: "href" - - name: "clicks" - css: "td.cBlue" +- name: list + is_list: true + list_css: "table tr:not(:first-child)" + fields: + - name: "title" + css: "td:nth-child(1) > a" + - name: "url" + css: "td:nth-child(1) > a" + attr: "href" + - name: "clicks" + css: "td.cBlue" settings: ROBOTSTXT_OBEY: false USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu index fbf720e4b..5643c9801 100644 --- a/backend/template/spiderfile/Spiderfile.baidu +++ b/backend/template/spiderfile/Spiderfile.baidu @@ -4,19 +4,19 @@ start_url: http://www.baidu.com/s?wd=crawlab start_stage: list engine: scrapy stages: - list: - is_list: true - list_xpath: //*[contains(@class, "c-container")] - page_xpath: //*[@id="page"]//a[@class="n"][last()] - page_attr: href - fields: - - name: title - xpath: .//h3/a - - name: url - xpath: .//h3/a - attr: href - - name: abstract - xpath: .//*[@class="c-abstract"] +- name: list + is_list: true + list_xpath: //*[contains(@class, "c-container")] + page_xpath: //*[@id="page"]//a[@class="n"][last()] + page_attr: href + fields: + - name: title + xpath: .//h3/a + - name: url + xpath: .//h3/a + attr: href + - name: abstract + xpath: .//*[@class="c-abstract"] settings: ROBOTSTXT_OBEY: false USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiderfile/Spiderfile.toscrapy_books b/backend/template/spiderfile/Spiderfile.toscrapy_books index 4bf18f617..247b4f40a 100644 --- a/backend/template/spiderfile/Spiderfile.toscrapy_books +++ b/backend/template/spiderfile/Spiderfile.toscrapy_books @@ -4,25 +4,25 @@ start_url: "http://books.toscrape.com" start_stage: "list" engine: "scrapy" stages: - list: - is_list: true - list_css: "section article.product_pod" - page_css: "ul.pager li.next a" - page_attr: "href" - fields: - - name: "title" - css: "h3 > a" - - name: "url" - css: "h3 > a" - attr: "href" - next_stage: "detail" - - name: "price" - css: ".product_price > .price_color" - detail: - is_list: false - fields: - - name: "description" - css: "#product_description + p" +- name: list + is_list: true + list_css: "section article.product_pod" + page_css: "ul.pager li.next a" + page_attr: "href" + fields: + - name: "title" + css: "h3 > a" + - name: "url" + css: "h3 > a" + attr: "href" + next_stage: "detail" + - name: "price" + css: ".product_price > .price_color" +- name: detail + is_list: false + fields: + - name: "description" + css: "#product_description + p" settings: ROBOTSTXT_OBEY: true AUTOTHROTTLE_ENABLED: true From 3c5a882b75e91af4916d5b379a419ec31d61cb09 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Fri, 13 Dec 2019 13:01:00 +0800 Subject: [PATCH 16/21] =?UTF-8?q?=E5=B0=86=E5=8F=AF=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E7=88=AC=E8=99=ABstages=E8=B0=83=E6=95=B4=E4=B8=BA=E5=88=97?= =?UTF-8?q?=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/components/Config/ConfigList.vue | 96 +++++++++++-------- 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index 5c7a9dc21..e7c799705 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -133,9 +133,9 @@ :value="activeNames" >