Home > Blockchain >  How to get both the chardata and the value of the attributes of an XML tag when decoding it in Golang
How to get both the chardata and the value of the attributes of an XML tag when decoding it in Golang

Time:01-24

My XML file looks something like this:

<page>
    <title>Antoine Meillet</title>
    <ns>0</ns>
    <id>3</id>
    <revision>
      <id>178204512</id>
      <parentid>178097574</parentid>
      <timestamp>2020-12-30T10:12:14Z</timestamp>
      <contributor>
        <username>Rovo</username>
        <id>34820</id>
      </contributor>
      <minor />
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text bytes="11274" xml:space="preserve">
        a lot of text
      </text>
      <sha1>ikqy1f9ppwo8eo38a0hh817eynr40vg</sha1>
    </revision>
  </page>

My goal is to filter out a lot of those tags and only keep the page tag and those inner tags: title, id, text.

So far, I have been able to successfully extract the page tag with title and id having the right value. This is what I get:

<page>
 <title>Antoine Meillet</title>
 <id>3</id>
 <text bytes="0" xml:space=""></text>
</page>
<page>
 <title>Algèbre linéaire</title>
 <id>7</id>
 <text bytes="0" xml:space=""></text>
</page>

As you can see, the problem is that the text tag has the wrong values for its attributes and contains no text.

I have achieved this using this piece of code:

package main

import (
    "encoding/xml"
    "fmt"
    "io"
    "os"
)

// Page is the decode target for one <page> element; the same value is then
// re-encoded to produce the filtered output.
//
// NOTE(review): as declared, Text is never populated from the sample document
// shown above, which matches the bytes="0" / empty <text> in the output.
type Page struct {
    XMLName xml.Name `xml:"page"`
    Title   string   `xml:"title"`
    Id      int64    `xml:"id"`
    // The "text" tag only matches a <text> element that is a direct child of
    // <page>; in the sample document <text> is nested inside <revision>.
    Text    struct {
        Key   float32 `xml:"bytes,attr"`
        // NOTE(review): encoding/xml writes namespaced names as
        // "namespace-URL local"; a tag of "xml:space" presumably never matches
        // the xml:space attribute when decoding - confirm against the package docs.
        Space string  `xml:"xml:space,attr"`
        // No field is tagged `xml:",chardata"`, so the element's character
        // data has nowhere to be stored.
    } `xml:"text"`
}

// main filters frwiki10000.xml into cleaned_fr_wiki.xml and exits with a
// non-zero status if any step fails.
func main() {
    if err := cleanWiki("frwiki10000.xml", "cleaned_fr_wiki.xml"); err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
}

// cleanWiki streams the XML document at inPath token by token, decodes every
// <page> element into a Page and writes the re-encoded Page to outPath.
//
// It returns the first error encountered while opening the files, reading
// tokens, decoding a page, encoding a page, or closing the output file.
func cleanWiki(inPath, outPath string) (err error) {
    in, err := os.Open(inPath)
    if err != nil {
        return fmt.Errorf("opening input: %w", err)
    }
    // Deferred right after the successful open so every return path closes it.
    defer in.Close()

    out, err := os.Create(outPath)
    if err != nil {
        return fmt.Errorf("creating output: %w", err)
    }
    defer func() {
        // A failed close on a written file can mean lost data, so report it
        // unless an earlier error is already being returned.
        if closeErr := out.Close(); closeErr != nil && err == nil {
            err = fmt.Errorf("closing output: %w", closeErr)
        }
    }()

    enc := xml.NewEncoder(out)
    enc.Indent("", " ")

    dec := xml.NewDecoder(in)
    for {
        tok, tokenErr := dec.Token()
        if tokenErr == io.EOF {
            return nil
        }
        if tokenErr != nil {
            // Stop here: retrying Token after a failure would loop forever.
            return fmt.Errorf("decoding token: %w", tokenErr)
        }

        start, ok := tok.(xml.StartElement)
        if !ok || start.Name.Local != "page" {
            continue
        }

        var page Page
        if decodeErr := dec.DecodeElement(&page, &start); decodeErr != nil {
            return fmt.Errorf("decoding element %q: %w", start.Name.Local, decodeErr)
        }
        fmt.Println("Element was decoded successfully.")
        fmt.Printf("Page title: %v\n Page id: %d\n", page.Title, page.Id)
        fmt.Printf("Text: %v", page.Text)

        if encodeErr := enc.Encode(page); encodeErr != nil {
            return fmt.Errorf("encoding page %d: %w", page.Id, encodeErr)
        }
    }
}

How would I be able to solve this problem, please?

Thanks.

CodePudding user response:

Simply decoding to the struct and encoding again will satisfy your goal.

Please check this: https://go.dev/play/p/69vjlve4P6p

CodePudding user response:

To parse a huge XML file, use the standard xml Decoder.

Call Token to read tokens one by one. When a start element with the required name ("page") is found, call DecodeElement to decode the element and prepare the result for the next steps.

// PageText holds the parts of a <text> element that are kept: its attributes
// and its character data. Page and PageTarget share it so the decoded value
// can be assigned straight into the output struct.
type PageText struct {
    // Key is the value of the bytes attribute.
    // NOTE(review): a float32 is formatted in exponent notation once the value
    // reaches 1e6; consider an integer type if byte counts can get that large.
    Key float32 `xml:"bytes,attr"`
    // Space is the xml:space attribute. encoding/xml spells namespaced names
    // as "namespace-URL local", and the reserved xml prefix maps to this URL.
    Space string `xml:"http://www.w3.org/XML/1998/namespace space,attr"`
    // Content is the character data between <text> and </text>.
    Content string `xml:",chardata"`
}

// Page mirrors the input layout of a <page> element, where <text> is nested
// inside <revision>.
type Page struct {
    XMLName  xml.Name `xml:"page"`
    Title    string   `xml:"title"`
    Id       int64    `xml:"id"`
    Revision struct {
        Text PageText `xml:"text"`
    } `xml:"revision"`
}

// PageTarget is the flattened output layout: <text> becomes a direct child of
// <page> and every other revision field is dropped.
type PageTarget struct {
    XMLName xml.Name `xml:"page"`
    Title   string   `xml:"title"`
    Id      int64    `xml:"id"`
    Text    PageText `xml:"text"`
}
    dec := xml.NewDecoder(strings.NewReader(sample))
    
loop:
    for {
        tok, err := dec.Token()
        switch {
        case err != nil && err != io.EOF:
            panic(err)
        case err == io.EOF:
            break loop
        case tok == nil:
            fmt.Println("token is nill")

        }

        switch se := tok.(type) {
        case xml.StartElement:
            if se.Name.Local == "page" {
                var page Page
                if err := dec.DecodeElement(&page, &se); err != nil {
                    panic(err)
                }

                target := PageTarget{
                    XMLName: page.XMLName,
                    Id:      page.Id,
                    Title:   page.Title,
                    Text:    page.Revision.Text,
                }

                out, err := xml.MarshalIndent(target, " ", "  ")
                if err != nil {
                    panic(err)
                }
                fmt.Println(string(out))
            }
        }
    }

PLAYGROUND

  •  Tags:  
  • Related