APACH POI mengurai HTML dengan JSOUP

Mengurai teks HTML yang berisi tag ‹font› tidak mengatur ulang ukuran dan jenis font setelah ‹/font›

Kode saya berfungsi cukup baik kecuali setelah ‹/font›.

Sebelum ‹font size=9› blablabla.. ukuran teks adalah 11. Saya berharap setelah ‹/font› ukuran teks diatur ulang menjadi 11, tetapi masih tetap di 9. Hal yang sama untuk keluarga font.

Pastinya saya salah paham cara menggunakan jsoup. Lebih baik saya menggunakan CSS, tetapi saya tidak tahu caranya.

Terimakasih atas bantuannya.

package test;
import java.awt.Color;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.math.BigInteger;
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTblLayoutType;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STTblLayoutType;

public class ReadHtml 
{
    protected static java.util.Vector<String> contenu = null;
    org.apache.xmlbeans.XmlCursor cursor = null;

   class WStyle
   {
      protected  String  police     =   "Times New Roman";
      protected  int     taille     =   11;
      protected  Color   couleur    =   Color.black;
      protected  boolean gras       =   false;
      protected  boolean italique   =   false;

      public WStyle() // constructeur
      {
          super();
      }

      protected String getPolice()      {return  police;}
      protected int getTaille()         {return  taille;}
      protected Color getCouleur()      {return  couleur;}
      protected boolean getGras()       {return  gras;}  
      protected boolean getItalique()   {return  italique;} 

      protected void    setPolice(String p)     {police=p;}
      protected void    setTaille(int t)        {taille=t;}
      protected void    setCouleur(Color c)     {couleur=c;}
      protected void    setGras(boolean g)      {gras=g;}  
      protected void    setItalique(boolean i)  {italique=i;} 
   }

    public ReadHtml()
    {
        super();
        contenu =  new java.util.Vector<String>();
        createWordFile();

    }



    private  XWPFParagraph getTableParagraph(XWPFTableCell  cell,  String html)
    {
            cell.removeParagraph(0);
            XWPFParagraph paragraph = cell.addParagraph();
            paragraph.setSpacingAfterLines(0);
            paragraph.setSpacingAfter(0);

            Document htmlDocument = Jsoup.parse(html);
            Elements htmlParagraphs = htmlDocument.select("p");
            for(Element htmlParagraph : htmlParagraphs) 
            {
                  System.out.println(htmlParagraph);
                  ParagraphNodeVisitor nodeVisitor = new ParagraphNodeVisitor(paragraph);
                  NodeTraversor.traverse(nodeVisitor, htmlParagraph);
            }
            return paragraph;
    }

    private   void createWordFile()
    {
          XWPFParagraph para =  null;
          try
          {
                XWPFDocument document = new XWPFDocument();
                FileOutputStream out = new FileOutputStream(new File("./", "NewTable.docx"));

                XWPFTable table = document.createTable();
                CTTblLayoutType type = table.getCTTbl().getTblPr().addNewTblLayout();
                type.setType(STTblLayoutType.FIXED);
                table.getCTTbl().addNewTblGrid().addNewGridCol().setW(BigInteger.valueOf(1670));
                table.getCTTbl().getTblGrid().addNewGridCol().setW(BigInteger.valueOf(6000));

                String myTexte =    "<html><head</head><body><p><font face=\"Verdana\" size=11>Good Morning</font>  <font size=9 face=\"Times\"> " +
                                    "<i><b>how are you today </b></i></font> Not so bad.<br>Thanks";  


                // first  line
                XWPFTableRow tableRow= table.getRow(0);
                para = getTableParagraph(tableRow.getCell(0),  "<p>Row #1,  Col. #1");
                tableRow .getCell(0).setParagraph(para);
                XWPFTableCell cell = tableRow.createCell(); 
                para = getTableParagraph(cell,  myTexte);  // Row #1,  Col. #2
                tableRow .getCell(1).setParagraph(para);

                // seconde line
                tableRow= table.createRow();
                para = getTableParagraph(tableRow.getCell(0),  "<p>Row #2,  Col. #1");
                tableRow .getCell(0).setParagraph(para);
                para = getTableParagraph(tableRow.getCell(1),  "<p>Row #2,  Col. #2");
                tableRow.getCell(1).setParagraph(para);

                document.write(out);
                document.close();
                out.close();
                System.out.println("NewTable.docx written successully");
          } 
          catch (FileNotFoundException e) {System.out.println("File exception --> "  + e.toString()); } 
          catch (IOException e) {System.out.println("I/O exception --> "  + e.toString()); }  
          catch (Exception e)  {System.out.println("Other exception --> "  + e.toString()); } 
    } 

    public class ParagraphNodeVisitor implements NodeVisitor 
    {
          String nodeName;
          String fontFace;
          String fontType;
          boolean needNewRun;
          boolean isItalic;
          boolean isBold;
          boolean isUnderlined;
          int fontSize;
          String fontColor;
          VerticalAlign align = VerticalAlign.BASELINE ;
          XWPFParagraph paragraph;
          XWPFRun run;

          ParagraphNodeVisitor(XWPFParagraph paragraph) 
          {
               this.paragraph = paragraph;
               this.run = paragraph.createRun();
               this.nodeName = "";
               this.needNewRun = false;
               this.isItalic = false;
               this.isBold = false;
               this.isUnderlined = false;
               this.fontSize = 11;
               this.fontColor = "000000";
               this.fontFace="Times";
          }

          @Override
          public void head(Node node, int depth) 
          {
              nodeName = node.nodeName();
              needNewRun = false;
              if ("#text".equals(nodeName)) 
              {
                  run.setText(((TextNode)node).text());
                  needNewRun = true; //after setting the text in the run a new run is needed
              } 
              else if ("i".equals(nodeName)) {isItalic = true;} 
              else if ("b".equals(nodeName)) {isBold = true;} 
              else if ("sup".equals(nodeName)){align = VerticalAlign.SUPERSCRIPT ;} 
              else if ("u".equals(nodeName)) {isUnderlined = true;} 
              else if ("br".equals(nodeName)) {run.addBreak();} 
              else if ("p".equals(nodeName)) {run.addBreak();} 
              else if ("font".equals(nodeName)) 
              {
                  fontColor = (!"".equals(node.attr("color")))?node.attr("color").substring(1):"000000";
                  fontSize = (!"".equals(node.attr("size")))?Integer.parseInt(node.attr("size")):11;
                  fontFace = (!"".equals(node.attr("face")))?node.attr("face"):"Times";
              } 
              if (needNewRun) run = paragraph.createRun();
              needNewRun = false;
              run.setItalic(isItalic);
              run.setBold(isBold);
              if (isUnderlined) run.setUnderline(UnderlinePatterns.SINGLE); 
              else run.setUnderline(UnderlinePatterns.NONE);
              run.setColor(fontColor); 
              run.setFontSize(fontSize);
              run.setFontFamily(fontFace);
              run.setSubscript(align);
          }

          @Override
          public void tail(Node node, int depth) 
          {

              nodeName = node.nodeName();
              System.out.println("Node=" + nodeName);
              if ("i".equals(nodeName)) {isItalic = false;} 
              else if ("b".equals(nodeName)) {isBold = false;} 
              else if ("u".equals(nodeName)) {isUnderlined = false;} 
              else if ("sup".equals(nodeName)) {align= VerticalAlign.BASELINE ;}
              else if ("font".equals("nodeName")) 
              {
                  fontColor = "000000";
                  fontSize = 11;
                  fontFace="Times";
                  System.out.println("Family=" + fontFace + "   Taille=" + fontSize);
              }

              if (needNewRun) run = paragraph.createRun();
               needNewRun = false;
               run.setItalic(isItalic);
               run.setBold(isBold);
               if (isUnderlined) run.setUnderline(UnderlinePatterns.SINGLE); else run.setUnderline(UnderlinePatterns.NONE);
               run.setColor(fontColor); 
               run.setFontSize(fontSize);

               run.setFontFamily(fontFace);
               run.setSubscript(align);
          }
    }
    public static void main(String[] args) 
    {
              new ReadHtml() ;
    } 
}

person Gerard Martinelli    schedule 24.01.2019    source sumber
comment
Kode yang Anda gunakan di sini hampir sama dengan kode yang saya berikan sebagai jawaban atas Pertanyaan terakhir Anda: stackoverflow.com/questions/54268485/. Jadi saya curiga jawaban itu telah menjawab pertanyaan Anda? Jika ya, silakan baca stackoverflow.com/help/someone-answers. Dan ini adalah pertanyaan tentang kebaikan dan keadilan setidaknya untuk menyebutkan bahwa Anda bukanlah penulis asli dari bagian utama kode yang Anda tampilkan sekarang dalam pertanyaan itu.   -  person Axel Richter    schedule 24.01.2019


Jawaban (1)


Silakan ubah baris berikut dalam metode tail Anda, dari

else if ("font".equals("nodeName")) 

to

else if ("font".equals(nodeName)) 

Anda telah membandingkan dua literal string alih-alih membandingkan string literal dengan variabel. Karena salah ketik kondisinya selalu false, oleh karena itu fontSize tidak pernah direset.

person Jakub Ch.    schedule 24.01.2019