【教程】将WordPress内容导出转化为知识库TXT

方法

使用 WP All Export Pro 插件(见脚注1)将文章导出为XLSX格式的文件,再用C#编写一个转换程序:

安装依赖:

Html2Markdown
ExcelDataReader
System.Text.Encoding.CodePages

ExcelToTxt类:

using ExcelDataReader;
using System.IO;
using System.Text.RegularExpressions;
using Html2Markdown;

namespace TextConvert
{
    /// <summary>
    /// Converts a spreadsheet of exported articles into plain-text files,
    /// one file per spreadsheet row, with the article body reduced from HTML
    /// to lightweight Markdown-like text.
    /// </summary>
    internal class ExcelToTxt
    {
        /// <summary>
        /// Reads every row of the workbook at <paramref name="excelFilePath"/> and writes
        /// "./txt/&lt;workbook name&gt;/&lt;id&gt;-&lt;title&gt;.txt" for each non-blank row.
        /// </summary>
        /// <remarks>
        /// Columns are read by position: 0 = id, 1 = title, 2 = summary, 3 = creation date,
        /// 4 = link, 5 = category, 6 = tags, 7 = HTML content.
        /// Only the reader's initial result set is processed (no NextResult call is made).
        /// NOTE(review): a header row, if the export contains one, is converted like any
        /// other row — confirm whether the first row should be skipped.
        /// </remarks>
        /// <param name="excelFilePath">Path of the .xls/.xlsx file to convert.</param>
        public void ConvertToTxt(string excelFilePath)
        {
            string baseFolderPath = "./txt"; // "txt" folder, relative to the current working directory
            string excelFileNameWithoutExtension = Path.GetFileNameWithoutExtension(excelFilePath);
            string secondLevelFolderPath = Path.Combine(baseFolderPath, excelFileNameWithoutExtension);

            using (var stream = File.Open(excelFilePath, FileMode.Open, FileAccess.Read))
            using (var reader = ExcelReaderFactory.CreateReader(stream))
            {
                // Create the output folder once per workbook instead of re-checking on every row.
                // CreateDirectory is a no-op when the folder already exists.
                Directory.CreateDirectory(secondLevelFolderPath);

                while (reader.Read())
                {
                    string articleId = GetValueAsString(reader, 0);
                    string articleTitle = GetValueAsString(reader, 1);
                    string articleSummary = GetValueAsString(reader, 2);
                    string articleCreateDate = GetValueAsString(reader, 3);
                    string articleLink = GetValueAsString(reader, 4);
                    string category = GetValueAsString(reader, 5);
                    string tags = GetValueAsString(reader, 6);
                    string rawContent = GetValueAsString(reader, 7);

                    // Fully blank rows would all be written to the same "-.txt" file
                    // containing nothing but field labels, so skip them.
                    if (IsBlankRow(articleId, articleTitle, articleSummary, articleCreateDate,
                                   articleLink, category, tags, rawContent))
                    {
                        continue;
                    }

                    string articleContent = HtmlToMarkdown(rawContent);

                    string txtContent = $"文章ID:{articleId}\n文章标题:{articleTitle}\n文章摘要:{articleSummary}\n文章创建日期:{articleCreateDate}\n文章链接:{articleLink}\n分类目录:{category}\n标签:{tags}\n文章内容:\n{articleContent}";

                    string txtFileName = $"{SanitizeFileName(articleId)}-{SanitizeFileName(articleTitle)}.txt";
                    string fullPath = Path.Combine(secondLevelFolderPath, txtFileName);
                    File.WriteAllText(fullPath, txtContent);
                }
            }
        }

        /// <summary>Returns true when every supplied cell is empty or whitespace.</summary>
        private static bool IsBlankRow(params string[] cells)
        {
            foreach (string cell in cells)
            {
                if (!string.IsNullOrWhiteSpace(cell))
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Removes every character that <see cref="Path.GetInvalidFileNameChars"/> reports
        /// as invalid, so the result can be used as part of a file name.
        /// </summary>
        /// <param name="name">Raw text; null is treated as empty.</param>
        /// <returns>The text with invalid file-name characters dropped.</returns>
        private string SanitizeFileName(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return string.Empty;
            }

            // Splitting on the invalid characters and concatenating the pieces drops them.
            return string.Concat(name.Split(Path.GetInvalidFileNameChars()));
        }

        /// <summary>
        /// Reads the cell at <paramref name="columnIndex"/> of the current row as text.
        /// </summary>
        /// <param name="reader">Reader positioned on the row to read.</param>
        /// <param name="columnIndex">Zero-based column position.</param>
        /// <returns>
        /// The cell text, or an empty string when the cell is empty or the column
        /// lies outside the reader's field count.
        /// </returns>
        private string GetValueAsString(IExcelDataReader reader, int columnIndex)
        {
            // A sheet with fewer columns than expected must not abort the whole conversion.
            if (columnIndex < 0 || columnIndex >= reader.FieldCount)
            {
                return string.Empty;
            }

            object value = reader.GetValue(columnIndex);
            if (value is null)
            {
                return string.Empty;
            }

            if (value is double number)
            {
                // Numeric cells arrive as double; format them independently of the
                // machine's culture so the decimal separator is always '.'.
                return number.ToString(System.Globalization.CultureInfo.InvariantCulture);
            }

            return value.ToString() ?? string.Empty;
        }

        /// <summary>
        /// Reduces an HTML fragment to compact Markdown-like text: runs the Html2Markdown
        /// converter, rewrites or strips any tags that remain, decodes HTML entities and
        /// removes blank lines.
        /// </summary>
        /// <param name="html">HTML fragment; null or empty yields an empty string.</param>
        /// <returns>The trimmed text.</returns>
        public static string HtmlToMarkdown(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // Singleline lets "(.*?)" span line breaks so multi-line elements are matched.
            // "\b" after each tag name keeps <p> from matching <pre>, <em> from matching
            // <embed>, <li> from matching <link>, and so on.
            const RegexOptions tagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            // Drop opening <p> tags before handing the fragment to the converter.
            string markdown = Regex.Replace(html, @"<p\b[^>]*>", "", tagOptions);

            var converter = new Converter();
            markdown = converter.Convert(markdown);

            // Whatever the converter left behind is handled below.
            markdown = Regex.Replace(markdown, @"</p>", "\n\n", tagOptions);

            // <strong> -> bold, <em> -> italic
            markdown = Regex.Replace(markdown, @"<strong\b[^>]*>(.*?)</strong>", "**$1**", tagOptions);
            markdown = Regex.Replace(markdown, @"<em\b[^>]*>(.*?)</em>", "_$1_", tagOptions);

            // Lists: remove the containers and turn each item into its own "- " line.
            markdown = Regex.Replace(markdown, @"</?(ul|ol)\b[^>]*>", "", tagOptions);
            markdown = Regex.Replace(markdown, @"<li\b[^>]*>(.*?)</li>", "- $1\n", tagOptions);

            // Headings: emit as many '#' as the heading level, followed by a space.
            markdown = Regex.Replace(
                markdown,
                @"<h([1-6])\b[^>]*>(.*?)</h\1>",
                match =>
                {
                    int level = match.Groups[1].Value[0] - '0';
                    return "\n" + new string('#', level) + " " + match.Groups[2].Value.Trim() + "\n";
                },
                tagOptions);

            // Remove any other HTML tags.
            markdown = Regex.Replace(markdown, @"<[^>]*>", "", RegexOptions.IgnoreCase);

            // Decode entities in a single pass (after tag removal, so escaped markup such
            // as "&lt;div&gt;" survives as text). A single pass also keeps "&amp;lt;" as
            // "&lt;" rather than decoding it twice into "<".
            markdown = System.Net.WebUtility.HtmlDecode(markdown);

            // &nbsp; decodes to U+00A0; normalise it to a plain space.
            markdown = markdown.Replace('\u00A0', ' ');

            // Remove blank lines.
            markdown = Regex.Replace(markdown, @"^\s*\n", "", RegexOptions.Multiline);

            return markdown.Trim();
        }
    }
}

调用方法:

using System.IO;
using System.Text;
using System.Windows;

/// <summary>
/// Converts every Excel workbook (.xls / .xlsx) found directly inside the "./excel"
/// folder by passing it to <c>ExcelToTxt.ConvertToTxt</c>. When the folder does not
/// exist yet it is created and nothing is converted.
/// </summary>
public void Main()
{
    // Register the code-page encoding provider before any workbook is opened.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    ExcelToTxt excelToTxt = new ExcelToTxt();
    string directoryPath = "./excel"; // "excel" folder, relative to the current working directory
    string[] excelExtensions = { ".xls", ".xlsx" };

    // Exact, case-insensitive extension match. A substring test against ".xls|.xlsx"
    // would also accept files with no extension (empty string), ".x", ".xl", etc.
    bool IsExcelFile(string path)
    {
        // Excel creates "~$<name>" owner/lock files next to open workbooks; they carry
        // an Excel extension but are not workbooks.
        if (Path.GetFileName(path).StartsWith("~$", System.StringComparison.Ordinal))
        {
            return false;
        }

        string extension = Path.GetExtension(path);
        foreach (string candidate in excelExtensions)
        {
            if (string.Equals(candidate, extension, System.StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    // Create the input folder on first run; there is nothing to convert in that case.
    if (!Directory.Exists(directoryPath))
    {
        Directory.CreateDirectory(directoryPath);
        return;
    }

    // Process every Excel file in the top level of the folder.
    foreach (string excelFile in Directory.GetFiles(directoryPath, "*.*", SearchOption.TopDirectoryOnly))
    {
        if (IsExcelFile(excelFile))
        {
            excelToTxt.ConvertToTxt(excelFile);
        }
    }
}

由于导出的文章内容是HTML格式,所以先将HTML转为Markdown以减少字数,再去除无用的标记符号和多余的换行,这样得到的TXT就可以放入知识库中进行转化了。可以试试小站右侧的AI小助手,我就是这样做的。

脚注

  1. 下载链接:
    分享名称:WP All Export Pro
    分享链接:https://kb.itpno.com/#s/-b0tgO9w
    访问密码:iTPno. ↩︎
暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
Source: https://github.com/MengXi2021/Argon-Emoji-DailyNotes
Source: https://github.com/Ghost-chu/argon-huhu-emotions
Source: github.com/zhheo/Sticker-Heo
颜文字
Emoji
小恐龙
花!
每日手帐
呼呼
Heo
上一篇
下一篇