fix: parse HTML lists within a status

Renamed the StripHTMLTags function to ConvertHTMLToText and updated it to
properly parse and convert both ordered and unordered lists to plain text.
This commit is contained in:
Dan Anglin 2024-06-01 08:29:45 +01:00
parent 5cf6116fe8
commit 6a799ff5ac
Signed by: dananglin
GPG key ID: 0C1D44CFBEE68638
4 changed files with 47 additions and 14 deletions

View file

@ -88,7 +88,7 @@ func (a Account) Display(noColor bool) string {
metadata += fmt.Sprintf(
"\n %s: %s",
utilities.FieldFormat(noColor, field.Name),
utilities.StripHTMLTags(field.Value),
utilities.ConvertHTMLToText(field.Value),
)
}
@ -105,7 +105,7 @@ func (a Account) Display(noColor bool) string {
utilities.FieldFormat(noColor, "Following:"), a.FollowingCount,
utilities.FieldFormat(noColor, "Statuses:"), a.StatusCount,
utilities.HeaderFormat(noColor, "BIOGRAPHY:"),
utilities.WrapLines(utilities.StripHTMLTags(a.Note), "\n ", 80),
utilities.WrapLines(utilities.ConvertHTMLToText(a.Note), "\n ", 80),
utilities.HeaderFormat(noColor, "METADATA:"),
metadata,
utilities.HeaderFormat(noColor, "ACCOUNT URL:"),

View file

@ -180,7 +180,7 @@ func (s Status) Display(noColor bool) string {
format,
utilities.DisplayNameFormat(noColor, s.Account.DisplayName), s.Account.Username,
utilities.HeaderFormat(noColor, "CONTENT:"),
utilities.WrapLines(utilities.StripHTMLTags(s.Content), "\n ", 80),
utilities.WrapLines(utilities.ConvertHTMLToText(s.Content), "\n ", 80),
utilities.HeaderFormat(noColor, "STATUS ID:"),
s.ID,
utilities.HeaderFormat(noColor, "CREATED AT:"),

View file

@ -30,7 +30,7 @@ func (t Timeline) Display(noColor bool) string {
createdAt = status.Reblog.CreatedAt
}
builder.WriteString(utilities.WrapLines(utilities.StripHTMLTags(status.Content), "\n", 80) + "\n\n")
builder.WriteString(utilities.WrapLines(utilities.ConvertHTMLToText(status.Content), "\n", 80) + "\n\n")
builder.WriteString(utilities.FieldFormat(noColor, "ID:") + " " + statusID + "\t" + utilities.FieldFormat(noColor, "Created at:") + " " + utilities.FormatTime(createdAt) + "\n")
builder.WriteString(separator + "\n")
}

View file

@ -1,16 +1,34 @@
package utilities
import (
"io"
"strconv"
"strings"
"golang.org/x/net/html"
)
func StripHTMLTags(text string) string {
token := html.NewTokenizer(strings.NewReader(text))
// List context tracked while converting HTML to text. The value is stored in
// htmlConvertState.htmlListType and decides which prefix, if any, is written
// when an <li> tag is processed.
const (
// htmlNoList means no list is open; it is the initial state and is restored
// on </ul> and </ol>. An <li> seen in this state gets no prefix.
htmlNoList int = iota
// htmlOrderedList is set on <ol>; each <li> is prefixed with its index
// followed by ". ".
htmlOrderedList
// htmlUnorderedList is set on <ul>; each <li> is prefixed with "• ".
htmlUnorderedList
)
// htmlConvertState carries list-related state from one tag token to the next
// while ConvertHTMLToText walks the tokenized input.
type htmlConvertState struct {
// htmlListType is one of htmlNoList, htmlOrderedList or htmlUnorderedList.
htmlListType int
// orderedListIndex is the number written before the next ordered-list item.
// It starts at 1, is incremented after each ordered <li>, and is reset to 1
// on </ol>.
orderedListIndex int
}
func ConvertHTMLToText(text string) string {
var builder strings.Builder
state := htmlConvertState{
htmlListType: htmlNoList,
orderedListIndex: 1,
}
token := html.NewTokenizer(strings.NewReader(text))
for {
tt := token.Next()
switch tt {
@ -21,18 +39,33 @@ func StripHTMLTags(text string) string {
builder.WriteString(text)
case html.StartTagToken, html.EndTagToken:
tag := token.Token().String()
builder.WriteString(transformTag(tag))
processTagToken(&state, &builder, tag)
}
}
}
func transformTag(tag string) string {
func processTagToken(state *htmlConvertState, writer io.StringWriter, tag string) {
switch tag {
case "<br>":
return "\n"
case "<p>", "</p>":
return "\n"
case "<br>", "<p>", "</p>", "</li>":
_, _ = writer.WriteString("\n")
case "<ul>":
state.htmlListType = htmlUnorderedList
_, _ = writer.WriteString("\n")
case "<ol>":
state.htmlListType = htmlOrderedList
_, _ = writer.WriteString("\n")
case "</ul>":
state.htmlListType = htmlNoList
case "</ol>":
state.htmlListType = htmlNoList
state.orderedListIndex = 1
case "<li>":
switch state.htmlListType {
case htmlUnorderedList:
_, _ = writer.WriteString("• ")
case htmlOrderedList:
_, _ = writer.WriteString(strconv.Itoa(state.orderedListIndex) + ". ")
state.orderedListIndex++
}
}
return ""
}