forked from unidoc/unioffice
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxsdany.go
269 lines (246 loc) · 7.12 KB
/
xsdany.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
// Copyright 2017 FoxyUtils ehf. All rights reserved.
//
// Use of this software package and source code is governed by the terms of the
// UniDoc End User License Agreement (EULA) that is available at:
// https://unidoc.io/eula/
// A trial license code for evaluation can be obtained at https://unidoc.io.
package unioffice
import (
"encoding/xml"
"strings"
"unicode"
)
// XSDAny is used to marshal/unmarshal xsd:any types in the OOXML schema.
//
// It stores one arbitrary XML element as a generic tree: UnmarshalXML fills
// it from the element it is handed, and MarshalXML writes the tree back out.
type XSDAny struct {
// XMLName is the element's name (namespace and local part).
XMLName xml.Name
// Attrs holds the element's attributes.
Attrs []xml.Attr
// Data holds the element's character data.
Data []byte
// Nodes holds the child elements, each as its own XSDAny.
Nodes []*XSDAny
}
// wellKnownSchemas maps the conventional namespace prefix to the namespace URI
// for namespaces that commonly occur inside 'any' elements. Entries are kept
// sorted by prefix.
var wellKnownSchemas = map[string]string{
	"a":       "http://schemas.openxmlformats.org/drawingml/2006/main",
	"dc":      "http://purl.org/dc/elements/1.1/",
	"dcterms": "http://purl.org/dc/terms/",
	"mc":      "http://schemas.openxmlformats.org/markup-compatibility/2006",
	"mo":      "http://schemas.microsoft.com/office/mac/office/2008/main",
	"w":       "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
	"w10":     "urn:schemas-microsoft-com:office:word",
	"w14":     "http://schemas.microsoft.com/office/word/2010/wordml",
	"w15":     "http://schemas.microsoft.com/office/word/2012/wordml",
	"w16":     "http://schemas.microsoft.com/office/word/2018/wordml",
	"w16cex":  "http://schemas.microsoft.com/office/word/2018/wordml/cex",
	"w16cid":  "http://schemas.microsoft.com/office/word/2016/wordml/cid",
	"w16se":   "http://schemas.microsoft.com/office/word/2015/wordml/symex",
	"wne":     "http://schemas.microsoft.com/office/word/2006/wordml",
	"wp":      "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
	"wp14":    "http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing",
	"wpc":     "http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas",
	"wpg":     "http://schemas.microsoft.com/office/word/2010/wordprocessingGroup",
	"wpi":     "http://schemas.microsoft.com/office/word/2010/wordprocessingInk",
	"wps":     "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
	"x15ac":   "http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac",
	"xsi":     "http://www.w3.org/2001/XMLSchema-instance",
}
// wellKnownSchemasInv is the inverse of wellKnownSchemas: it maps a namespace
// URI to its conventional prefix. It is computed once at package
// initialisation.
var wellKnownSchemasInv = func() map[string]string {
	inverse := make(map[string]string, len(wellKnownSchemas))
	for prefix, uri := range wellKnownSchemas {
		inverse[uri] = prefix
	}
	return inverse
}()
// any is the unexported counterpart of XSDAny that is actually handed to
// encoding/xml. Its struct tags make the decoder/encoder capture an element
// generically: all attributes, all child elements and the character data.
//
// NOTE: within this package the name shadows Go's predeclared identifier any.
type any struct {
// XMLName is the element's name.
XMLName xml.Name
// Attrs receives every attribute of the element.
Attrs []xml.Attr `xml:",any,attr"`
// Nodes receives every child element, recursively.
Nodes []*any `xml:",any"`
// Data receives the element's character data.
Data []byte `xml:",chardata"`
}
// dd walks the node tree rooted at a depth-first. It neither reads nor
// modifies anything besides the Nodes slices it recurses through, so calling
// it has no observable effect on a well-formed tree.
func dd(a *any) {
	for i := 0; i < len(a.Nodes); i++ {
		dd(a.Nodes[i])
	}
}
// UnmarshalXML implements the xml.Unmarshaler interface.
//
// The element introduced by start is decoded generically (name, attributes,
// character data and child elements) and copied into x. On a decode error the
// error is returned unchanged and x is left untouched.
func (x *XSDAny) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	a := any{}
	if err := d.DecodeElement(&a, &start); err != nil {
		return err
	}
	x.XMLName = a.XMLName
	x.Attrs = a.Attrs
	x.Data = a.Data
	x.Nodes = convertToXNodes(a.Nodes)
	return nil
}
// nsSet tracks the namespace prefixes assigned while marshaling one element
// tree. urlToPrefix and prefixToURL are the two directions of the assignment
// maintained by getPrefix; both maps must be non-nil before getPrefix is
// called. prefixes lists the assigned prefixes in order of first assignment.
type nsSet struct {
urlToPrefix map[string]string
prefixToURL map[string]string
prefixes []string //required for deterministic output
}
// getPrefix returns the prefix assigned to the namespace URI ns, assigning and
// recording a new one the first time ns is seen.
//
// Well-known namespaces receive their conventional prefix from
// wellKnownSchemasInv unless that prefix is already bound to a different
// namespace. Any other namespace gets a prefix built from the leading
// characters of its last path/URN segment, lengthened one character at a time
// (and padded with '_' once the segment is exhausted) until it is unused.
//
// The namespace URI is always recorded exactly as given: namespace names are
// compared as literal strings, so "http://purl.org/dc/dcmitype/" must not be
// written back as "http://purl.org/dc/dcmitype".
//
// Both maps of n must be non-nil.
func (n *nsSet) getPrefix(ns string) string {
	// do we have a prefix for this ns?
	if pfx, ok := n.urlToPrefix[ns]; ok {
		return pfx
	}

	// Common namespaces are used in these 'any' elements and some versions
	// of Word really want the prefix to match what they write out. This
	// occurred primarily with docProps/core.xml
	if pfx, ok := wellKnownSchemasInv[ns]; ok {
		bound, taken := n.prefixToURL[pfx]
		if taken && bound == ns {
			return pfx
		}
		if !taken {
			n.prefixToURL[pfx] = ns
			n.urlToPrefix[ns] = pfx
			n.prefixes = append(n.prefixes, pfx)
			return pfx
		}
		// The conventional prefix already names another namespace; reusing it
		// would bind this element to the wrong URI, so derive a fresh prefix.
	}

	// Trying to construct a decent looking valid prefix. Only this local copy
	// is trimmed; the URI that gets recorded stays untouched.
	trimmed := strings.TrimFunc(ns, func(r rune) bool {
		return !unicode.IsLetter(r)
	})

	// determine the last path portion of the namespace
	// "urn:schemas-microsoft-com:office:office" = "office"
	// "http://schemas.microsoft.com/office/word/2012/wordml" = "wordml"
	split := strings.Split(trimmed, "/")
	split = strings.Split(split[len(split)-1], ":")
	// last segment of the namespace
	last := split[len(split)-1]

	pfx := make([]byte, 0, len(last)+1)
	for lng := 0; ; lng++ {
		if lng < len(last) {
			pfx = append(pfx, last[lng])
		} else {
			pfx = append(pfx, '_')
		}
		// is this prefix unused?
		candidate := string(pfx)
		if _, taken := n.prefixToURL[candidate]; !taken {
			n.prefixToURL[candidate] = ns
			n.urlToPrefix[ns] = candidate
			n.prefixes = append(n.prefixes, candidate)
			return candidate
		}
	}
}
// ignorables is the set of namespace prefixes that MarshalXML lists in the
// mc:Ignorable attribute when they are in use together with the "mc"
// namespace. Entries are kept sorted by prefix.
var ignorables = map[string]bool{
	"w10":    true,
	"w14":    true,
	"w15":    true,
	"w16":    true,
	"w16cex": true,
	"w16cid": true,
	"w16se":  true,
	"wp14":   true,
	"x15ac":  true,
}
// applyToNode rewrites a and its descendants so that namespaces are expressed
// as literal "prefix:local" names: the namespace part of each element and
// attribute name is cleared and the prefix obtained from getPrefix is put in
// front of the local name.
//
// A node without a namespace is left completely untouched, and its attributes
// and children are not visited. The attribute slice is rebuilt rather than
// edited in place, so the slice a.Attrs pointed to on entry is not modified.
func (n nsSet) applyToNode(a *any) {
	space := a.XMLName.Space
	if space == "" {
		return
	}
	a.XMLName.Local = n.getPrefix(space) + ":" + a.XMLName.Local
	a.XMLName.Space = ""

	declared := a.Attrs
	a.Attrs = nil
	for _, attr := range declared {
		switch attr.Name.Space {
		case "xmlns":
			// skip namespace prefix declaration attributes as we create them later
			continue
		case "":
			// attribute without a namespace: kept as is
		default:
			attr.Name.Local = n.getPrefix(attr.Name.Space) + ":" + attr.Name.Local
			attr.Name.Space = ""
		}
		a.Attrs = append(a.Attrs, attr)
	}

	for i := range a.Nodes {
		n.applyToNode(a.Nodes[i])
	}
}
// collectNS walks a tree of nodes finding any non-default namespace being used
// and registers each one with ns via getPrefix. For every node the element
// name is visited first, then its attributes in order (namespace declarations,
// i.e. attributes in the "xmlns" space, are skipped), then its children
// depth-first.
func (x *XSDAny) collectNS(ns *nsSet) {
	if space := x.XMLName.Space; space != "" {
		ns.getPrefix(space)
	}
	for i := range x.Attrs {
		switch space := x.Attrs[i].Name.Space; space {
		case "", "xmlns":
			// nothing to register
		default:
			ns.getPrefix(space)
		}
	}
	for _, child := range x.Nodes {
		child.collectNS(ns)
	}
}
// convertToXNodes converts a slice of decoded nodes into the equivalent
// XSDAny tree, recursing into child nodes. Name, attributes and data are
// carried over as they are (slices are shared, not copied). The result is
// always a non-nil slice, even for empty input.
func convertToXNodes(an []*any) []*XSDAny {
	converted := make([]*XSDAny, 0, len(an))
	for _, src := range an {
		converted = append(converted, &XSDAny{
			XMLName: src.XMLName,
			Attrs:   src.Attrs,
			Data:    src.Data,
			Nodes:   convertToXNodes(src.Nodes),
		})
	}
	return converted
}
// convertToNodes converts a slice of XSDAny nodes into the representation
// handed to the XML encoder, recursing into child nodes. Attributes whose
// local name is "xmlns" are dropped; everything else is carried over. Both the
// returned slice and each node's attribute slice are non-nil, even when empty.
func convertToNodes(xn []*XSDAny) []*any {
	out := make([]*any, 0, len(xn))
	for _, src := range xn {
		kept := make([]xml.Attr, 0, len(src.Attrs))
		for _, attr := range src.Attrs {
			if attr.Name.Local == "xmlns" {
				continue
			}
			kept = append(kept, attr)
		}
		out = append(out, &any{
			XMLName: src.XMLName,
			Attrs:   kept,
			Data:    src.Data,
			Nodes:   convertToNodes(src.Nodes),
		})
	}
	return out
}
// MarshalXML implements the xml.Marshaler interface.
//
// The element is written from x's own name, attributes, data and children;
// the start argument is not used. Every namespace found in the tree is given
// a prefix, names are rewritten to the literal "prefix:local" form, and one
// xmlns:prefix declaration per namespace is added to the root element in
// order of first use. When the "mc" namespace is in use together with any
// prefix listed in ignorables, an mc:Ignorable attribute naming those prefixes
// is added as well.
//
// The error returned by the encoder is passed through unchanged.
func (x *XSDAny) MarshalXML(e *xml.Encoder, start xml.StartElement) error {
	a := any{}
	a.XMLName = x.XMLName
	// Cap the slice at its length: when applyToNode leaves the attributes
	// alone (root element without a namespace) the appends below must copy
	// rather than write into spare capacity of the caller's backing array.
	a.Attrs = x.Attrs[:len(x.Attrs):len(x.Attrs)]
	a.Data = x.Data
	a.Nodes = convertToNodes(x.Nodes)

	nss := nsSet{
		urlToPrefix: map[string]string{},
		prefixToURL: map[string]string{},
	}

	// collect any namespaces in use in the node tree
	x.collectNS(&nss)

	// apply our new namespaces to the node and its children
	nss.applyToNode(&a)

	// add our prefixes and namespaces to root element
	var attrsToIgnore []string
	includeIgnorable := false
	for _, pfx := range nss.prefixes {
		if _, ok := ignorables[pfx]; ok {
			attrsToIgnore = append(attrsToIgnore, pfx)
		}
		a.Attrs = append(a.Attrs, xml.Attr{
			Name:  xml.Name{Local: "xmlns:" + pfx},
			Value: nss.prefixToURL[pfx],
		})
		if pfx == "mc" {
			includeIgnorable = true
		}
	}

	// NOTE(review): if x already carries an mc:Ignorable attribute this adds
	// a second one — confirm callers never marshal such an element.
	if includeIgnorable && len(attrsToIgnore) > 0 {
		a.Attrs = append(a.Attrs, xml.Attr{
			Name:  xml.Name{Local: "mc:Ignorable"},
			Value: strings.Join(attrsToIgnore, " "),
		})
	}

	// finally write out our new element
	return e.Encode(&a)
}