79 lines
4.3 KiB
Markdown
79 lines
4.3 KiB
Markdown
---
|
|
title: Web::Scraperの参考になるサイト
|
|
author: kazu634
|
|
date: 2009-04-15
|
|
wordtwit_post_info:
|
|
- 'O:8:"stdClass":13:{s:6:"manual";b:0;s:11:"tweet_times";i:1;s:5:"delay";i:0;s:7:"enabled";i:1;s:10:"separation";s:2:"60";s:7:"version";s:3:"3.7";s:14:"tweet_template";b:0;s:6:"status";i:2;s:6:"result";a:0:{}s:13:"tweet_counter";i:2;s:13:"tweet_log_ids";a:1:{i:0;i:4559;}s:9:"hash_tags";a:0:{}s:8:"accounts";a:1:{i:0;s:7:"kazu634";}}'
|
|
categories:
|
|
- Perl
|
|
|
|
---
|
|
<div class="section">
|
|
<p>
|
|
Web::Scraperを使うたびに検索するのが面倒くさいので、ここに役立つサイトを貼っておきます。
|
|
</p>
|
|
|
|
<p>
|
|
<a href="http://e8y.net/mag/013-web-scraper/" onclick="__gaTracker('send', 'event', 'outbound-article', 'http://e8y.net/mag/013-web-scraper/', 'use Web::Scraper; – 今日のCPANモジュール');" target="_blank">use Web::Scraper; – 今日のCPANモジュール</a>
|
|
</p>
|
|
|
|
<h4>
|
|
今日作成したスクリプト
|
|
</h4>
|
|
|
|
<p>
|
|
「<a href="http://www.e-kotoba.net/" onclick="__gaTracker('send', 'event', 'outbound-article', 'http://www.e-kotoba.net/', 'いい言葉ねっと By Shu');" target="_blank">いい言葉ねっと By Shu</a>」の今日のいい言葉を取得します。
|
|
</p>
|
|
|
|
<pre class="syntax-highlight">
|
|
<span class="synComment"># === Libraries ===</span>
|
|
<span class="synStatement">use strict</span>;
|
|
<span class="synStatement">use warnings</span>;
|
|
<span class="synComment"># === Libraries ===</span>
|
|
<span class="synStatement">use </span>URI;
|
|
<span class="synStatement">use </span>Web::Scraper;
|
|
<span class="synStatement">use </span>YAML;
|
|
<span class="synStatement">use </span>Encode;
|
|
<span class="synStatement">use utf8</span>;
|
|
<span class="synComment"># === Main part ===</span>
|
|
<span class="synStatement">my</span> <span class="synIdentifier">$scraper</span> = scraper {
|
|
process <span class="synConstant">'/html/body/div/div/div/div[4]/div/div[2]/div[2]/table/tr/td/div/strong'</span>,
|
|
<span class="synConstant">'body'</span> => <span class="synConstant">'TEXT'</span>;
|
|
};
|
|
<span class="synStatement">my</span> <span class="synIdentifier">$result</span> = <span class="synIdentifier">$scraper</span>->scrape( URI-><span class="synStatement">new</span>(<span class="synConstant">"http://www.e-kotoba.net/"</span>) );
|
|
<span class="synStatement">print</span> encode(<span class="synConstant">'utf8'</span>, <span class="synIdentifier">$result</span>->{body});
|
|
<span class="synComment"># Reference: http://e8y.net/mag/013-web-scraper/</span>
|
|
<span class="synComment"># Below is the sample</span>
|
|
<span class="synComment"># (1) &lt;div class="ekotoba"&gt;...&lt;/div&gt;の部分をbodyに取得</span>
|
|
<span class="synComment"># process 'div.ekotoba', 'body' => 'TEXT';</span>
|
|
<span class="synComment"># (2) XPATHで指定</span>
|
|
<span class="synComment"># Firebugなどを用いる。</span>
|
|
</pre>
|
|
|
|
<h4>
|
|
作成するときのテンプレート
|
|
</h4>
|
|
|
|
<pre class="syntax-highlight">
|
|
<span class="synComment"># === Libraries ===</span>
|
|
<span class="synStatement">use </span>URI;
|
|
<span class="synStatement">use </span>Web::Scraper;
|
|
<span class="synStatement">use </span>YAML;
|
|
<span class="synStatement">use </span>Encode;
|
|
<span class="synStatement">use utf8</span>;
|
|
<span class="synComment"># === Main part ===</span>
|
|
<span class="synStatement">my</span> <span class="synIdentifier">$scraper</span> = scraper {
|
|
process <span class="synConstant">'XPath'</span>,
|
|
<span class="synComment"># Reference: http://e8y.net/mag/013-web-scraper/</span>
|
|
<span class="synComment"># Below is the sample</span>
|
|
<span class="synComment"># (1) &lt;div class="ekotoba"&gt;...&lt;/div&gt;の部分をbodyに取得</span>
|
|
<span class="synComment"># process 'div.ekotoba', 'body' => 'TEXT';</span>
|
|
<span class="synComment"># (2) XPATHで指定</span>
|
|
<span class="synComment"># Firebugなどを用いる。</span>
|
|
};
|
|
<span class="synStatement">my</span> <span class="synIdentifier">$result</span> = <span class="synIdentifier">$scraper</span>->scrape( URI-><span class="synStatement">new</span>(<span class="synConstant">"http://google.com/"</span> ));
|
|
<span class="synStatement">print</span> encode(<span class="synConstant">'utf8'</span>, YAML::Dump(<span class="synIdentifier">$result</span>));
|
|
<span class="synComment"># print encode('utf8', YAML::Dump($result->{body}));</span>
|
|
</pre>
|
|
</div>
|