diff --git a/examples/web_crawler/README.md b/examples/web_crawler/README.md index c8a741ff4d..cef93eb019 100644 --- a/examples/web_crawler/README.md +++ b/examples/web_crawler/README.md @@ -1,7 +1,5 @@ # web_crawler -web_crawler is a very simple web crawler. -This web crawler fetches news from tuicool.com, -(a chinese site similar to hacker-news.firebaseio.com). +This simple web crawler fetches news from the homepage of HN (https://news.ycombinator.com/). # Compile and Run @@ -19,4 +17,4 @@ v run web_crawler.v This project shows how to use http.fetch() to get http.Response, and then html.parse() to parse the returned html. -It's easy, isn't it? +It's easy, isn't it? diff --git a/examples/web_crawler/web_crawler.v b/examples/web_crawler/web_crawler.v index da1ed3342c..ccb9b8ab2f 100644 --- a/examples/web_crawler/web_crawler.v +++ b/examples/web_crawler/web_crawler.v @@ -2,24 +2,20 @@ import net.http import net.html fn main() { - // http.fetch() sends an HTTP request to the URL with the given method and configurations. - config := http.FetchConfig{ + site_url := 'https://news.ycombinator.com' + resp := http.fetch( + url: site_url user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0' - } - resp := http.fetch(http.FetchConfig{ ...config, url: 'https://tuicool.com' }) or { - println('failed to fetch data from the server') - return - } - // html.parse() parses and returns the DOM from the given text. + )! mut doc := html.parse(resp.body) - // html.DocumentObjectModel.get_tags_by_attribute_value() retrieves all tags in the document that have the given attribute name and value. - tags := doc.get_tags_by_attribute_value('class', 'list_article_item') - for tag in tags { - el := tag.children[1].children[0].children[0].children[0] - href := el.attributes['href'] or { panic('key not found') } - title := el.attributes['title'] or { panic('key not found') } - println('href: ${href}') - println('title: ${title}') - println('') + tags := doc.get_tags_by_attribute_value('class', 'titleline') + for i, tag in tags { + el := tag.children[0] + mut href := el.attributes['href']! + if !href.starts_with('http') { + href = '${site_url}/${href}' + } + title := el.content + println('${i + 1:2}. title: ${title:-90s} href: ${href}') } }