Text Extraction

การสกัดข้อความ — Aspose.Note FOSS สำหรับ Python

Aspose.Note FOSS สำหรับ Python เปิดให้เข้าถึงเนื้อหาข้อความทั้งหมดของทุกหน้า OneNote ผ่านโหนด RichText แต่ละ RichText มีทั้งสตริงข้อความธรรมดา .Text และ .TextRuns ซึ่งเป็นรายการของส่วน TextRun ที่กำหนดสไตล์แยกกัน หน้านี้อธิบายรูปแบบการสกัดข้อความทั้งหมดที่มีให้ใช้

สกัดข้อความธรรมดาทั้งหมด

วิธีที่เร็วที่สุดในการดึงข้อความทั้งหมดจากเอกสารคือ GetChildNodes(RichText) ซึ่งท่องไปทั่วทั้ง DOM แบบเรียกซ้ำในลักษณะ depth-first (แนวลึกก่อน):

from aspose.note import Document, RichText

# Load the section file and print every non-empty text block in the document.
doc = Document("MyNotes.one")
for node in doc.GetChildNodes(RichText):
    content = node.Text
    if not content:
        continue  # skip RichText nodes that carry no text
    print(content)

รวบรวมเป็นรายการและเชื่อมต่อ:

from aspose.note import Document, RichText

doc = Document("MyNotes.one")
# Gather the non-empty text blocks first, then join them one per line.
text_blocks = [rt.Text for rt in doc.GetChildNodes(RichText) if rt.Text]
all_text = "\n".join(text_blocks)

สกัดข้อความต่อหน้า

จัดระเบียบข้อความที่สกัดตามชื่อหน้า:

from aspose.note import Document, Page, RichText

doc = Document("MyNotes.one")
for page in doc.GetChildNodes(Page):
    # Use a placeholder heading when the page has no title text node.
    if page.Title and page.Title.TitleText:
        title = page.Title.TitleText.Text
    else:
        title = "(untitled)"
    print(f"\n=== {title} ===")

    # Print the non-empty text blocks that belong to this page only.
    for node in page.GetChildNodes(RichText):
        if not node.Text:
            continue
        print(node.Text)

ตรวจสอบการจัดรูปแบบ Run

RichText.TextRuns คือรายการของอ็อบเจ็กต์ TextRun แต่ละ run ครอบคลุมช่วงอักขระต่อเนื่องที่ใช้ TextStyle เดียวกัน:

from aspose.note import Document, RichText

doc = Document("MyNotes.one")

# Boolean TextStyle flags paired with the label printed when the flag is set.
flag_labels = (
    ("IsBold", "bold"),
    ("IsItalic", "italic"),
    ("IsUnderline", "underline"),
    ("IsStrikethrough", "strikethrough"),
    ("IsSuperscript", "superscript"),
    ("IsSubscript", "subscript"),
)

for rt in doc.GetChildNodes(RichText):
    for run in rt.TextRuns:
        style = run.Style
        parts = [name for attr, name in flag_labels if getattr(style, attr)]
        # Font details are appended only when the style actually sets them.
        if style.FontName:
            parts.append(f"font={style.FontName!r}")
        if style.FontSize:
            parts.append(f"size={style.FontSize}pt")
        # A run with no formatting at all is reported as "plain".
        label = ", ".join(parts) or "plain"
        print(f"[{label}] {run.Text!r}")

อ้างอิงคุณสมบัติ TextStyle

คุณสมบัติ	ประเภท	คำอธิบาย
`IsBold`	`bool`	ข้อความหนา
`IsItalic`	`bool`	ข้อความเอียง
`IsUnderline`	`bool`	ข้อความขีดเส้นใต้
`IsStrikethrough`	`bool`	ข้อความขีดฆ่า
`IsSuperscript`	`bool`	ตัวยก
`IsSubscript`	`bool`	ตัวห้อย
`FontName`	`str | None`	ชื่อฟอนต์
`FontSize`	`float | None`	ขนาดฟอนต์ (หน่วยพอยต์)
`FontColor`	`int | None`	สีฟอนต์
`Highlight`	`int | None`	สีไฮไลท์
`Language`	`int | None`	รหัสภาษา
`IsHyperlink`	`bool`	run นี้เป็นไฮเปอร์ลิงก์หรือไม่
`HyperlinkAddress`	`str | None`	ที่อยู่ของไฮเปอร์ลิงก์

สกัดไฮเปอร์ลิงก์

ไฮเปอร์ลิงก์ถูกจัดเก็บที่ระดับ TextRun ให้ตรวจสอบ Style.IsHyperlink:

from aspose.note import Document, RichText

doc = Document("MyNotes.one")
for rt in doc.GetChildNodes(RichText):
    for run in rt.TextRuns:
        style = run.Style
        # Only runs flagged as hyperlinks that also carry an address are listed.
        if not (style.IsHyperlink and style.HyperlinkAddress):
            continue
        print(f"  {run.Text!r:40s} -> {style.HyperlinkAddress}")

สกัดข้อความหนาและไฮไลท์

กรอง run ตามคุณสมบัติการจัดรูปแบบเพื่อแยกเนื้อหาเฉพาะ:

from aspose.note import Document, RichText

doc = Document("MyNotes.one")

# First pass: bold runs that contain something other than whitespace.
print("=== Bold segments ===")
for rt in doc.GetChildNodes(RichText):
    for run in rt.TextRuns:
        if not run.Style.IsBold:
            continue
        text = run.Text.strip()
        if text:
            print(f"  {text!r}")

# Second pass: highlighted runs, shown with the highlight colour as hex.
print("\n=== Highlighted segments ===")
for rt in doc.GetChildNodes(RichText):
    for run in rt.TextRuns:
        highlight = run.Style.Highlight
        if highlight is None:
            continue
        text = run.Text.strip()
        if not text:
            continue
        # Keep only the low 24 bits so the value prints as six hex digits.
        color = f"#{highlight & 0xFFFFFF:06X}"
        print(f"  [{color}] {text!r}")

สกัดข้อความจากบล็อกหัวเรื่อง

ชื่อหน้าเป็นโหนด RichText ที่อยู่ภายในอ็อบเจ็กต์ Title โหนดเหล่านี้จะไม่ถูกส่งคืนโดย GetChildNodes(RichText) ระดับบนสุดของหน้า เว้นแต่คุณจะรวมซับทรีของ Title ด้วย ให้เข้าถึงโดยตรง:

from aspose.note import Document, Page

doc = Document("MyNotes.one")
for page in doc.GetChildNodes(Page):
    title = page.Title
    if not title:
        continue  # page has no title block

    # Every title part is optional, so print only the ones that are present.
    title_parts = (
        ("Title text:", title.TitleText),
        ("Title date:", title.TitleDate),
        ("Title time:", title.TitleTime),
    )
    for label, node in title_parts:
        if node:
            print(label, node.Text)

สกัดข้อความจากตาราง

เซลล์ตารางมีโหนด RichText เป็นลูก ให้เรียก GetChildNodes ซ้อนกัน:

from aspose.note import Document, Table, TableRow, TableCell, RichText

# Print each table row as a list of cell strings.
doc = Document("MyNotes.one")
for table in doc.GetChildNodes(Table):
    for row in table.GetChildNodes(TableRow):
        row_values = []
        for cell in row.GetChildNodes(TableCell):
            # Skip empty RichText nodes, as the other examples do: they would
            # otherwise add stray spaces to the joined text, and a missing
            # text value would make str.join() raise TypeError.
            cell_text = " ".join(
                rt.Text for rt in cell.GetChildNodes(RichText) if rt.Text
            ).strip()
            row_values.append(cell_text)
        print(row_values)

การดำเนินการข้อความในหน่วยความจำ

แทนที่ข้อความ

RichText.Replace(old_value, new_value) แทนที่ข้อความในหน่วยความจำในทุก run:

from aspose.note import Document, RichText

doc = Document("MyNotes.one")
# Replace "TODO" with "DONE" in every RichText node.
# Changes are in-memory only; saving back to .one is not supported.
for node in doc.GetChildNodes(RichText):
    node.Replace("TODO", "DONE")

เพิ่ม run ของข้อความ

from aspose.note import Document, RichText, TextStyle

doc = Document("MyNotes.one")
# This example touches only the first RichText node, if the document has one.
first_rt = next(iter(doc.GetChildNodes(RichText)), None)
if first_rt is not None:
    first_rt.Append(" [reviewed]")  # appends with default style

บันทึกข้อความที่สกัดออกไปยังไฟล์

import sys
from aspose.note import Document, RichText

# Switch stdout to UTF-8 where the stream supports it, so that printing
# characters outside the system code page does not fail.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

doc = Document("MyNotes.one")
lines = [node.Text for node in doc.GetChildNodes(RichText) if node.Text]

# Write one text block per line; the file is always UTF-8.
output = "\n".join(lines)
with open("extracted.txt", "w", encoding="utf-8") as out_file:
    out_file.write(output)

block_count = len(lines)
print(f"Extracted {block_count} text blocks.")

เคล็ดลับ

GetChildNodes(RichText) บน Document จะค้นหาทั่วทั้งต้นไม้ ครอบคลุมทุกหน้า โครงร่าง และองค์ประกอบของโครงร่าง หากต้องการจำกัดขอบเขต ให้เรียกบน Page ที่ต้องการ
ตรวจสอบ rt.Text เสมอ (เช่น if rt.Text:) ก่อนใช้งาน เพราะเอกสารบางฉบับมีโหนด RichText ที่ว่างเปล่า
บน Windows, ปรับค่าใหม่ sys.stdout เป็น UTF-8 เพื่อหลีกเลี่ยง UnicodeEncodeError เมื่อพิมพ์อักขระที่อยู่นอกหน้าโค้ดของระบบ.
TextRun มีเพียงฟิลด์ Text และ Style ไม่มีคุณสมบัติ offset แบบ Start/End หากต้องการหาตำแหน่งข้อความของ run ภายใน RichText.Text ของโหนดแม่ ให้ค้นหา run.Text ภายใน rt.Text ด้วยตนเอง