• 大小: 48KB
    文件类型: .rar
    金币: 1
    下载: 0 次
    发布日期: 2021-05-20
  • 语言: 其他
  • 标签: DOM-TREE  c#  

资源简介

利用DOM-TREE模型对网页进行表示;对原始网页进行修正、缺省标签的补充等;利用网页正文提取方法对网页进行正文提取,去除网页中的噪声信息,提取出网页中的正文及相关超链接。

资源截图

代码片段和文件信息

using System;
using System.IO;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;


using mshtml;
using System.Runtime.InteropServices;

/// <summary>
/// Managed declaration of the COM <c>IPersistStreamInit</c> interface
/// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713), declared as an IUnknown-based
/// COM import so that a COM object can be cast to it from managed code.
/// </summary>
/// <remarks>
/// The member order is part of the COM contract (it defines the vtable layout)
/// and must not be changed.
/// </remarks>
[ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
    /// <summary>Retrieves the class identifier (CLSID) of the object.</summary>
    /// <param name="pClassID">Receives the CLSID.</param>
    void GetClassID([In, Out] ref Guid pClassID);

    /// <summary>Queries whether the object is dirty.</summary>
    /// <returns>The raw HRESULT; <c>PreserveSig</c> keeps it from being turned into an exception.</returns>
    [return: MarshalAs(UnmanagedType.I4)]
    [PreserveSig]
    int IsDirty();

    /// <summary>Initializes the object from the given stream.</summary>
    /// <param name="pstm">Stream to load from.</param>
    void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

    /// <summary>Saves the object to the given stream.</summary>
    /// <param name="pstm">Stream to save into.</param>
    /// <param name="fClearDirty">Passed through to the COM method as a 32-bit integer flag.</param>
    void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
     [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

    /// <summary>Retrieves the maximum size of the stream needed to save the object.</summary>
    /// <param name="pcbSize">Size argument.</param>
    // NOTE(review): the parameter is a scalar long marshalled as LPArray; this looks
    // inconsistent with an out-pointer parameter. Left unchanged to preserve the
    // declared interface - confirm before calling this member.
    void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

    /// <summary>Initializes the object to a default, empty state.</summary>
    void InitNew();
}


namespace WindowsFormsApplication1
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent(); 
            webBrowser1.Navigate(“HttpstyleUriParser://www.baidu.com“); 
        }

        private void webBrowser1_DocumentCompleted(object sender WebBrowserDocumentCompletedEventArgs e)
        {

        }

        private void treeView1_AfterSelect(object sender TreeViewEventArgs e)
        {

        }

        private void button1_Click(object sender EventArgs e)
        {
            if (webBrowser1.Document != null)
            {   //获取html 
                StreamReader sr = new StreamReader(webBrowser1.DocumentStream Encoding.GetEncoding(“gb2312“));
                String html = sr.ReadToEnd();
                richTextBox1.Text = html;


                //获取dom树 
                IHTMLDocument2 doc2 = Parse(html);
                IHTMLDocument3 htmldocument = (IHTMLDocument3)doc2;


                IHTMLDOMNode rootDomNode = (IHTMLDOMNode)htmldocument.documentElement;  //获取Dom树 


                TreeNode root = treeView1.Nodes.Add(“HTML“);  //跟节点 
                InsertDOMNodes(rootDomNode root);  //把其他节点插入到跟节点中 


            }
            else
            {
                MessageBox.Show(“webbrowser为空“);
            }  
        }
        unsafe IHTMLDocument2 Parse(string s)       //unsafe关键字表示不安全上下文,该上下文是任何涉及指针的操作所必需的。   
        {
            IHTMLDocument2 pDocument = new HTMLDocumentClass();
            if (pDocument != null)
            {
                IPersistStreamInit pPersist = pDocument as IPersistStreamInit;  //as运算符类似于强制转换操作;如果转换不可行,as会返回null而不是引发异常。 
                pPersist.InitNew();
                pPersist = null;
                IMarkupServices ms = pDocument as IMarkupServices;
                if (ms != null)
                {
                    IMarkupContainer pMC = null;
                    IMarkupPointer pStart pEnd;
                    ms.CreateMarkupPointer(out pStart);
                    ms.CreateMarkupPointer(o

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件      12288  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.exe

     文件      28160  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.pdb

     文件      14328  2010-11-04 11:25  WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe

     文件        490  2007-07-21 01:33  WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug\WindowsFormsApplication1.vshost.exe.manifest

     文件       5357  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\Form1.cs

     文件       3966  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\Form1.Designer.cs

     文件       5814  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\Form1.resx

     文件       1387  2010-11-04 11:25  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.FileListAbsolute.txt

     文件        847  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.csproj.GenerateResource.Cache

     文件      12288  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.exe

     文件        180  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Form1.resources

     文件      28160  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.pdb

     文件        180  2010-11-04 11:21  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\WindowsFormsApplication1.Properties.Resources.resources

     文件        516  2010-11-04 11:01  WindowsFormsApplication1\WindowsFormsApplication1\Program.cs

     文件       1466  2010-11-04 10:18  WindowsFormsApplication1\WindowsFormsApplication1\Properties\AssemblyInfo.cs

     文件       2877  2010-11-04 10:18  WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.Designer.cs

     文件       5612  2010-11-04 10:18  WindowsFormsApplication1\WindowsFormsApplication1\Properties\Resources.resx

     文件       1109  2010-11-04 10:18  WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.Designer.cs

     文件        249  2010-11-04 10:18  WindowsFormsApplication1\WindowsFormsApplication1\Properties\Settings.settings

     文件       4162  2010-11-04 11:20  WindowsFormsApplication1\WindowsFormsApplication1\WindowsFormsApplication1.csproj

     文件      12288  2010-11-04 11:24  WindowsFormsApplication1\WindowsFormsApplication1.exe

     文件        962  2010-11-04 10:18  WindowsFormsApplication1\WindowsFormsApplication1.sln

    ..A..H.     18944  2010-11-04 11:25  WindowsFormsApplication1\WindowsFormsApplication1.suo

     目录          0  2010-11-04 11:26  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug\TempPE

     目录          0  2010-11-04 11:26  WindowsFormsApplication1\WindowsFormsApplication1\bin\Debug

     目录          0  2010-11-04 11:26  WindowsFormsApplication1\WindowsFormsApplication1\obj\Debug

     目录          0  2010-11-04 11:26  WindowsFormsApplication1\WindowsFormsApplication1\bin

     目录          0  2010-11-04 11:26  WindowsFormsApplication1\WindowsFormsApplication1\obj

     目录          0  2010-11-04 11:26  WindowsFormsApplication1\WindowsFormsApplication1\Properties

     目录          0  2010-11-04 11:26  WindowsFormsApplication1\WindowsFormsApplication1

............此处省略4个文件信息

评论

共有 条评论