-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
commons.go
107 lines (95 loc) · 3.17 KB
/
commons.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
package mediawiki
import (
"context"
"fmt"
"strings"
"github.com/elliotchance/phpserialize"
"github.com/hashicorp/go-retryablehttp"
"gitlab.com/tozd/go/errors"
"gitlab.com/tozd/go/x"
)
// LatestCommonsEntitiesRun returns URL of the latest run of Wikimedia Commons entities JSON dump.
//
// It delegates to latestRun, handing it the entities dumps directory URL and
// the format string (two %s verbs) for the mediainfo JSON dump file URL.
func LatestCommonsEntitiesRun(ctx context.Context, client *retryablehttp.Client) (string, errors.E) {
	const (
		entitiesURL       = "https://dumps.wikimedia.org/commonswiki/entities/"
		entitiesURLFormat = "https://dumps.wikimedia.org/commonswiki/entities/%s/commons-%s-mediainfo.json.bz2"
	)

	return latestRun(ctx, client, entitiesURL, entitiesURLFormat)
}
// ProcessCommonsEntitiesDump downloads (unless already saved), decompresses, decodes JSON,
// and calls processEntity on every entity in a Wikimedia Commons entities JSON dump.
//
// Settings from config are copied into a ProcessConfig for commonsEntity items,
// with the file type fixed to JSONArray and the compression fixed to BZIP2.
// The error returned by Process is returned unchanged.
func ProcessCommonsEntitiesDump(
	ctx context.Context, config *ProcessDumpConfig,
	processEntity func(context.Context, Entity) errors.E,
) errors.E {
	// Adapter: each decoded commonsEntity is converted to Entity before it is
	// handed to the caller-supplied callback.
	handle := func(ctx context.Context, item commonsEntity) errors.E {
		return processEntity(ctx, Entity(item))
	}

	processConfig := &ProcessConfig[commonsEntity]{
		URL:                    config.URL,
		Path:                   config.Path,
		Client:                 config.Client,
		DecompressionThreads:   config.DecompressionThreads,
		DecodingThreads:        config.DecodingThreads,
		ItemsProcessingThreads: config.ItemsProcessingThreads,
		Process:                handle,
		Progress:               config.Progress,
		FileType:               JSONArray,
		Compression:            BZIP2,
	}

	return Process(ctx, processConfig)
}
// convertToStringMaps recursively replaces every map[interface{}]interface{}
// found in value with a map[string]interface{}.
//
// Slices of type []interface{} are updated in place (their elements are
// overwritten with the converted values) and the same slice is returned.
// Any other value is returned unchanged.
func convertToStringMaps(value interface{}) interface{} {
	switch typed := value.(type) {
	case map[interface{}]interface{}:
		return convertToStringMapsMap(typed)
	case []interface{}:
		for idx := range typed {
			typed[idx] = convertToStringMaps(typed[idx])
		}
		return typed
	default:
		return value
	}
}

// convertToStringMapsMap returns a new map with the entries of m, where every
// key is stringified with fmt.Sprint and every value is passed through
// convertToStringMaps. The result is never nil, even when m is nil or empty.
func convertToStringMapsMap(m map[interface{}]interface{}) map[string]interface{} {
	// The final size is known up front, so pre-size the map to avoid
	// repeated growth while filling it.
	converted := make(map[string]interface{}, len(m))
	for key, value := range m {
		converted[fmt.Sprint(key)] = convertToStringMaps(value)
	}
	return converted
}
// LatestCommonsImageMetadataRun returns URL of the latest run of Wikimedia Commons image table dump.
//
// It delegates to latestRun, handing it the Commons dumps directory URL and
// the format string (two %s verbs) for the image table SQL dump file URL.
func LatestCommonsImageMetadataRun(ctx context.Context, client *retryablehttp.Client) (string, errors.E) {
	const (
		dumpsURL       = "https://dumps.wikimedia.org/commonswiki/"
		imageURLFormat = "https://dumps.wikimedia.org/commonswiki/%s/commonswiki-%s-image.sql.gz"
	)

	return latestRun(ctx, client, dumpsURL, imageURLFormat)
}
// DecodeImageMetadata decodes image and other uploaded files metadata column in
// image table. See: https://www.mediawiki.org/wiki/Manual:Image_table
//
// metadata must hold a string, otherwise an error based on ErrUnexpectedType is
// returned with "expected" and "type" details set. The strings "", "0" and "-1"
// decode to an empty map. A string starting with "{" is decoded as JSON, any
// other string is decoded as PHP-serialized data and its maps are converted
// to string-keyed maps.
func DecodeImageMetadata(metadata interface{}) (map[string]interface{}, errors.E) {
	raw, isString := metadata.(string)
	if !isString {
		errE := errors.WithMessage(ErrUnexpectedType, "metadata")
		details := errors.Details(errE)
		details["expected"] = "string" //nolint:goconst
		details["type"] = fmt.Sprintf("%T", metadata)
		return nil, errE
	}

	// Placeholder values which carry no metadata.
	switch raw {
	case "", "0", "-1":
		return map[string]interface{}{}, nil
	}

	// Anything which does not look like a JSON object is treated as
	// PHP-serialized data.
	if !strings.HasPrefix(raw, "{") {
		var decoded map[interface{}]interface{}
		if err := phpserialize.Unmarshal([]byte(raw), &decoded); err != nil {
			return nil, errors.WithMessage(err, "phpserialize unmarshal")
		}
		return convertToStringMapsMap(decoded), nil
	}

	var decoded map[string]interface{}
	if errE := x.Unmarshal([]byte(raw), &decoded); errE != nil {
		return nil, errE
	}
	return decoded, nil
}