是免費軟體,值得推一下… 很方便的工具
XPath Helper 是安裝在 Google Chrome 上的擴充套件,方便自動取得 XPath 路徑
點選右上角xpath help後, 出現下面的黑框,在左邊的
設定xml檔
<?xml version="1.0" encoding="utf-8" ?>
<!-- XPath settings for the product page; kept in a file so the paths can be updated later without touching code. -->
<etmall>
<!-- XPath of the <h3> element that holds the product name. -->
<ProductName>//*[@id="productDetail"]/div[2]/section/section/div[1]/h3</ProductName>
<!-- XPath of the product <img> element inside the slider. -->
<ProductImg>//*[@id="prodSlider"]/div[2]/a/img</ProductImg>
</etmall>
|
示範:抓取這兩個xpath 路徑,做成xml檔,方便日後更新xpath
程式架構說明
/// <summary>
/// Loads a web page through a WinForms <see cref="WebBrowser"/>, parses the rendered
/// body with HtmlAgilityPack, and extracts product fields using XPath expressions
/// read from <c>PathStructure.xml</c> in the current directory.
/// </summary>
public class HtmlAgilityPackService
{
    // Value returned by the Get* methods whenever a lookup cannot be completed.
    private const string ErrorResult = "error";

    // Number of polls the "wait for navigation to start" phase allows before giving up.
    private const int StartPollLimit = 10;

    private readonly XmlDocument _xmldoc = new XmlDocument();
    private readonly string _path = Directory.GetCurrentDirectory();
    private readonly HtmlAgilityPack.HtmlDocument _doc = new HtmlAgilityPack.HtmlDocument();

    /// <summary>
    /// Reads the XPath settings file, navigates to <paramref name="URL"/>, waits for
    /// the page to load, and parses the resulting body HTML.
    /// </summary>
    /// <param name="URL">Address of the page to scrape.</param>
    public HtmlAgilityPackService(string URL)
    {
        // Load the XPath settings from the current working directory.
        _xmldoc.Load(Path.Combine(_path, "PathStructure.xml"));

        // The browser is only needed to obtain the HTML, so dispose it as soon as
        // the markup has been copied into the HtmlAgilityPack document.
        using (WebBrowser web = new WebBrowser())
        {
            web.ScriptErrorsSuppressed = true;
            web.Navigate(URL);
            waitTillLoad(web);
            _doc.Load(new StringReader(web.Document.Body.OuterHtml));
        }
    }

    /// <summary>
    /// Pumps the message loop until the browser reports the page as loaded
    /// (ready state Complete or Interactive, and not busy).
    /// </summary>
    /// <param name="webBrControl">Browser control that is navigating.</param>
    private void waitTillLoad(WebBrowser webBrControl)
    {
        WebBrowserReadyState loadStatus;

        // Phase 1: give the navigation a few polls to leave the Complete state,
        // i.e. to actually start loading the new page.
        int counter = 0;
        while (true)
        {
            loadStatus = webBrControl.ReadyState;
            Application.DoEvents();
            if (counter > StartPollLimit || loadStatus != WebBrowserReadyState.Complete && loadStatus != WebBrowserReadyState.Loaded)
            {
                break;
            }
            counter++;
        }

        // Phase 2: wait until the page is usable and the control is idle.
        // NOTE(review): this loop has no timeout; a page that never finishes
        // loading will keep the caller spinning here.
        while (true)
        {
            loadStatus = webBrControl.ReadyState;
            Application.DoEvents();
            bool ready = loadStatus == WebBrowserReadyState.Complete
                || loadStatus == WebBrowserReadyState.Interactive;
            if (ready && !webBrControl.IsBusy)
            {
                break;
            }
        }
    }

    /// <summary>
    /// Resolves the XPath stored at <paramref name="settingPath"/> in the settings
    /// file and returns the first matching HTML node.
    /// </summary>
    /// <param name="settingPath">XPath of the setting element inside the XML file.</param>
    /// <returns>The first matching node, or null when the setting or the node is missing.</returns>
    private HtmlNode SelectFirstNode(string settingPath)
    {
        XmlNode setting = _xmldoc.DocumentElement.SelectSingleNode(settingPath);
        if (setting == null)
        {
            return null;
        }

        // SelectNodes yields null (not an empty collection) when nothing matches.
        HtmlNodeCollection matches = _doc.DocumentNode.SelectNodes(setting.InnerText);
        if (matches == null || matches.Count == 0)
        {
            return null;
        }

        return matches[0];
    }

    /// <summary>Gets the product name.</summary>
    /// <returns>The inner text of the product-name node, or "error" on any failure.</returns>
    public string GetName()
    {
        try
        {
            HtmlNode nameNode = SelectFirstNode("/etmall/ProductName");
            return nameNode == null ? ErrorResult : nameNode.InnerText;
        }
        catch (Exception)
        {
            // Best-effort contract: any failure (e.g. a malformed XPath) maps to "error".
            return ErrorResult;
        }
    }

    /// <summary>Gets the product image address.</summary>
    /// <returns>The <c>src</c> attribute of the product image node, or "error" on any failure.</returns>
    public string GetImage()
    {
        try
        {
            HtmlNode imageNode = SelectFirstNode("/etmall/ProductImg");
            if (imageNode == null)
            {
                return ErrorResult;
            }

            // The indexer returns null when the attribute is absent.
            HtmlAttribute source = imageNode.Attributes["src"];
            return source == null ? ErrorResult : source.Value;
        }
        catch (Exception)
        {
            // Best-effort contract: any failure (e.g. a malformed XPath) maps to "error".
            return ErrorResult;
        }
    }
}
|
爬文的範例:
// Scrape the page whose URL was typed into textBox1.
var scraper = new HtmlAgilityPackService(textBox1.Text);
// Each getter returns the literal "error" when its lookup fails.
var productName = scraper.GetName();
var productImage = scraper.GetImage();
|
以上說明完畢~ 透過XPATH 取得DOM節點,變得更方便囉!
沒有留言:
張貼留言