Getting Viewable Text Words via Nokogiri

Getting viewable text words via Nokogiri

You want the Nokogiri::XML::Node#inner_text method:

require 'nokogiri'
require 'open-uri'
# Fetch and parse the page. Use URI.open (provided by open-uri): passing a
# URL to Kernel#open was deprecated in Ruby 2.7 and removed in Ruby 3.0.
html = Nokogiri::HTML(URI.open('http://stackoverflow.com/questions/6129357'))

# Alternatively, parse a local file
html = Nokogiri::HTML(File.read('myfile.html'))

# All text beneath <body>, with the markup stripped
text = html.at('body').inner_text

# Pretend that all words we care about contain only a-z, 0-9, or underscores
words = text.scan(/\w+/)
p words.length, words.uniq.length, words.uniq.sort[0..8]
#=> 907
#=> 428
#=> ["0", "1", "100", "15px", "2", "20", "2011", "220px", "24158nokogiri"]

# How about words that are only letters?
words = text.scan(/[a-z]+/i)
p words.length, words.uniq.length, words.uniq.sort[0..5]
#=> 872
#=> 406
#=> ["Answer", "Ask", "Badges", "Browse", "DocumentFragment", "Email"]
# Find the most frequent words
require 'pp'
# Tally words case-insensitively and return a Hash of word => count,
# ordered from the most frequent word to the least frequent.
def frequencies(words)
  tallies = Hash.new(0)
  words.each { |word| tallies[word.downcase] += 1 }
  Hash[tallies.sort_by { |_word, count| count }.reverse]
end
pp frequencies(words)
#=> {"nokogiri"=>34,
#=> "a"=>27,
#=> "html"=>18,
#=> "function"=>17,
#=> "s"=>13,
#=> "var"=>13,
#=> "b"=>12,
#=> "c"=>11,
#=> ...

# Hrm...let's drop the javascript code out of our words
html.css('script').remove
words = html.at('body').inner_text.scan(/\w+/)
pp frequencies(words)
#=> {"nokogiri"=>36,
#=> "words"=>18,
#=> "html"=>17,
#=> "text"=>13,
#=> "with"=>12,
#=> "a"=>12,
#=> "the"=>11,
#=> "and"=>11,
#=> ...

Getting text only when nokogiri certain HTML structure

I would delete the other nodes that are in this section if you're not using the document any further.

nokogiri_object.css("div.line1 *").each(&:remove)
nokogiri_object.at_css("div.line1").text.strip # => "text I need"

Search for text nodes in Nokogiri

I have not used Nokogiri, but in standard XPath, you should be able to just use the union operator:

doc.xpath('.//text() | text()')

Nokogiri grab only visible inner_text

You could try:

require 'nokogiri'
require 'open-uri'

# URI.open comes from open-uri; Kernel#open no longer fetches URLs on Ruby 3.0+.
doc = Nokogiri::HTML(URI.open("http://www.bodybuilding.com/store/catalog/new-products.jsp?addFacet=REF_BRAND:BRAND_MET_RX"))

# Visit every node in the document and print each text node that holds
# something other than whitespace.
doc.traverse do |node|
  # \A and \z anchor the whole string. ^ and $ match per line, so a text node
  # that merely contains a blank line would be mistaken for blank and skipped.
  puts node.text if node.text? && node.text !~ /\A\s*\z/
end

I have not done much with Nokogiri, but I believe this should find/output all text nodes in the document that are not blanks. This at least seems to be ignoring the javascript and all the text I checked was visible on the page (though some of it in the dropdown menus).

Get text directly inside a tag in Nokogiri

To get all the direct children with text, but not any further sub-children, you can use XPath like so:

doc.xpath('//dt/text()')

Or if you wish to use search:

doc.search('dt').xpath('text()')

Using Nokogiri to read and count word output: Undefined Method

Let's take a look at what you're doing:

require 'nokogiri'
require 'open-uri'

# URI.open comes from open-uri; Kernel#open no longer fetches URLs on Ruby 3.0+.
doc = Nokogiri::HTML(URI.open('http://www.imsdb.com/scripts/Authors-Anonymous.html'))

doc.css('b').remove
# css returns a NodeSet (the <pre> elements themselves), not their text
text = doc.css('pre')
text
# => [#<Nokogiri::XML::Element:0x3ff6686df65c name="pre" children=[#<Nokogiri::XML::Text:0x3ff6686df440 "\r\n\r\n\r\n">, #<Nokogiri::XML::Text:0x3ff6686def7c "\r\n\r\n\r\n Written by\r\n\r\n David Congalton\r\n\r\n\r\n\r\n\r\n July 14 2012\r\n\r\n">, #<Nokogiri::XML::Text:0x3ff6686deb1c "\r\n\r\n\r\n">, #<Nokogiri::XML::Text:0x3ff6686de694 "\r\n\r\n">, #<Nokogiri::XML::Text:0x3ff6686de20c ...

text.to_s
# => "<pre>\r\n\r\n\r\n\r\n\r\n\r\n Written by\r\n\r\n David Congalton\r\n\r\n\r\n\r\n\r\n July 14 2012\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n North Hayworth Avenue, off Sunset Boulevard. A quiet, tree-\r\n lined residential street. Note the small apartment complex\r\n set back from the curb.\r\n\r\n\r\n Our narrator is HENRY OBERT (O-BURT)(30).\r\n\r\n This is where...

text.to_s.scan(/\w+/)
# => ["pre", "Written", "by", "David", "Congalton", "July", "14", "2012", "North", "Hayworth", "Avenue", "off", "Sunset", "Boulevard", "A", "quiet", "tree", "lined", "residential", "street", "Note", "the", "small", "apartment", "complex", "set", "back", "from", "the", "curb", "Our", "narrator", "is", "HENRY", "OBERT", "O", "BURT", "30", "This", "is", "where", "where", "F", "Scott", "Fitzgerald", "died", "on", "December", "21", "1940", "INSERT", "ARCHIVAL", "PHOTOS", "of", "Fitzgerald", "H...

You're capturing the tags, parameters to those, plus the embedded text as a NodeSet, AKA, an array of Nodes. I don't think you want to do that.

Instead, I'd do something like this:

require 'nokogiri'
require 'open-uri'

# Build a Hash mapping each lower-cased word in +content+ to the number of
# times it occurs, with the most frequent words first.
def frequencies(content)
  grouped = content.group_by { |word| word.downcase }
  counted = grouped.map do |word, instances|
    [word, instances.size]
  end
  ascending = counted.sort_by { |pair| pair.last }
  Hash[ascending.reverse]
end

# URI.open comes from open-uri; Kernel#open no longer fetches URLs on Ruby 3.0+.
doc = Nokogiri::HTML(URI.open('http://www.imsdb.com/scripts/Authors-Anonymous.html'))

doc.css('b').remove
# Keep only the text content of each <pre> element, dropping the markup
text = doc.css('pre').map(&:text)
text
# => ["\r\n\r\n\r\n\r\n\r\n\r\n Written by\r\n\r\n David Congalton\r\n\r\n\r\n\r\n\r\n July 14 2012\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n North Hayworth Avenue, off Sunset Boulevard. A quiet, tree-\r\n lined residential street. Note the small apartment complex\r\n set back from the curb.\r\n\r\n\r\n Our narrator is HENRY OBERT (O-BURT)(30).\r\n\r\n This is where whe...

text.join(' ')
# => "\r\n\r\n\r\n\r\n\r\n\r\n Written by\r\n\r\n David Congalton\r\n\r\n\r\n\r\n\r\n July 14 2012\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n North Hayworth Avenue, off Sunset Boulevard. A quiet, tree-\r\n lined residential street. Note the small apartment complex\r\n set back from the curb.\r\n\r\n\r\n Our narrator is HENRY OBERT (O-BURT)(30).\r\n\r\n This is where wher...

content = text.join(' ').scan(/\w+/)
# => ["Written", "by", "David", "Congalton", "July", "14", "2012", "North", "Hayworth", "Avenue", "off", "Sunset", "Boulevard", "A", "quiet", "tree", "lined", "residential", "street", "Note", "the", "small", "apartment", "complex", "set", "back", "from", "the", "curb", "Our", "narrator", "is", "HENRY", "OBERT", "O", "BURT", "30", "This", "is", "where", "where", "F", "Scott", "Fitzgerald", "died", "on", "December", "21", "1940", "INSERT", "ARCHIVAL", "PHOTOS", "of", "Fitzgerald", "His", "w...

frequencies(content)
# => {"the"=>827, "to"=>486, "i"=>398, "a"=>397, "s"=>284, "and"=>279, "in"=>273, "of"=>238, "hannah"=>234, "you"=>232, "henry"=>223, "it"=>214, "on"=>207, "her"=>200, "is"=>192, "his"=>178, "he"=>165, "for"=>162, "t"=>152, "that"=>151, "colette"=>148, "she"=>142, "at"=>137, "john"=>133, "alan"=>118, "this"=>112, "my"=>109, "up"=>105, "all"=>88, "william"=>88, "as"=>85, "what"=>84, "with"=>84, "but"=>83, "be"=>76, "camera"=>76, "not"=>74, "one"=>74, "can"=>73, "out"=>70, "m"=>69, "from"=>...

I inserted some additional steps so you can see what is being returned more easily. You can ignore those.

The idea is to ignore the tags, except to use them to grab their text content, which is what map(&:text) does.

Things to watch out for:

  • \w doesn't mean [a-zA-Z0-9], it means [a-zA-Z0-9_], which matches variable names, not what we'd consider typical words.
  • Values that are pure digits, such as "14" and "2012" needlessly clutter the results. Using reject to remove all-digit entries would probably be good because those aren't usually very useful when determining keywords and such.

Nokogiri replace inner text with span ed words

My attempt to provide a solution for your problem:

require 'nokogiri'

Inf = 1.0/0.0

# Wraps every whitespace-separated word found beneath +node+ in a
# <span id="wN"> element, numbering the words sequentially as they are
# encountered. The tree is rewritten in place.
#
# node    - the Nokogiri node (or document) to process.
# counter - enumerator supplying the next word number. Callers normally omit
#           it; recursive calls pass the shared enumerator along so that the
#           numbering continues across the whole tree.
#
# Returns an Array of the nodes that stand in for +node+: [node] itself for
# a non-text node, or the generated spans (separated by single-space text
# nodes) for a text node. A whitespace-only text node yields an empty Array.
def number_words(node, counter = nil)
  # define infinite counter (Ruby >= 1.8.7)
  counter ||= (1..Inf).each
  doc = node.document

  unless node.is_a?(Nokogiri::XML::Text)
    # recurse for children and collect all the returned
    # nodes into an array
    children = node.children.inject([]) { |acc, child|
      acc + number_words(child, counter)
    }
    # replace the node's children
    node.children = Nokogiri::XML::NodeSet.new(doc, children)
    return [node]
  end

  # for text nodes, we generate a list of span nodes
  # and return it (this is more secure than OP's original
  # approach that is vulnerable to HTML injection)
  #
  # Read the decoded text with #content rather than #to_s: #to_s serializes
  # the node, so a character such as "&" would come back as "&amp;" and then
  # be escaped a second time when assigned to span.content.
  node.content.strip.split.inject([]) { |acc, word|
    # Node.new expects the owning Document; passing a Node is deprecated
    span = Nokogiri::XML::Node.new("span", doc)
    span.content = word
    span["id"] = "w#{counter.next}"
    # add a space if we are not at the beginning
    acc << Nokogiri::XML::Text.new(" ", doc) unless acc.empty?
    # add our new span to the collection
    acc << span
  }
end

# demo
if __FILE__ == $0
h = <<-HTML
<p class="stanza">Thus grew the tale of Wonderland:<br/>
Thus slowly, one by one,<br/>
Its quaint events were hammered out -<br/>
And now the tale is done,<br/>
And home we steer, a merry crew,<br/>
Beneath the setting sun.<br/></p>
HTML

doc = Nokogiri::HTML.parse(h)
number_words(doc)
p doc.to_xml
end

How to extract HTML links and text using Nokogiri (and XPATH and CSS)

This is a mini-example originally written in response to Getting attribute's value in Nokogiri to extract link URLs, extracted here in Community Wiki style for easy reference.

Here are some common operations you might do when parsing links in HTML, shown both in css and xpath syntax.

Starting with this snippet:

require 'rubygems'
require 'nokogiri'

html = <<HTML
<div id="block1">
<a href="http://google.com">link1</a>
</div>
<div id="block2">
<a href="http://stackoverflow.com">link2</a>
<a id="tips">just a bookmark</a>
</div>
HTML

doc = Nokogiri::HTML(html)

extracting all the links

We can use xpath or css to find all the <a> elements and then keep only the ones that have an href attribute:

nodeset = doc.xpath('//a')      # Get all anchors via xpath
nodeset.map {|element| element["href"]}.compact # => ["http://google.com", "http://stackoverflow.com"]

nodeset = doc.css('a') # Get all anchors via css
nodeset.map {|element| element["href"]}.compact # => ["http://google.com", "http://stackoverflow.com"]

In the above cases, the .compact is necessary because the search for the <a> element returns the "just a bookmark" element in addition to the others.

But we can use a more refined search to find just the elements that contain an href attribute:

attrs = doc.xpath('//a/@href')  # Get anchors w href attribute via xpath
attrs.map {|attr| attr.value} # => ["http://google.com", "http://stackoverflow.com"]

nodeset = doc.css('a[href]') # Get anchors w href attribute via css
nodeset.map {|element| element["href"]} # => ["http://google.com", "http://stackoverflow.com"]

finding a specific link

To find a link within the <div id="block2">

nodeset = doc.xpath('//div[@id="block2"]/a/@href')
nodeset.first.value # => "http://stackoverflow.com"

nodeset = doc.css('div#block2 a[href]')
nodeset.first['href'] # => "http://stackoverflow.com"

If you know you're searching for just one link, you can use at_xpath or at_css instead:

attr = doc.at_xpath('//div[@id="block2"]/a/@href')
attr.value # => "http://stackoverflow.com"

element = doc.at_css('div#block2 a[href]')
element['href'] # => "http://stackoverflow.com"

find a link from associated text

What if you know the text associated with a link and want to find its url? A little xpath-fu (or css-fu) comes in handy:

element = doc.at_xpath('//a[text()="link2"]')
element["href"] # => "http://stackoverflow.com"

element = doc.at_css('a:contains("link2")')
element["href"] # => "http://stackoverflow.com"

find text from a link

For completeness, here's how you'd get the text associated with a particular link:

element = doc.at_xpath('//a[@href="http://stackoverflow.com"]')
element.text # => "link2"

element = doc.at_css('a[href="http://stackoverflow.com"]')
element.text # => "link2"

useful references

In addition to the extensive Nokogiri documentation, I came across some useful links while writing this up:

  • a handy Nokogiri cheat sheet
  • a tutorial on parsing HTML with Nokogiri
  • interactively test CSS selector queries

Scrape only visible elements with Nokogiri

You can specify to ignore hidden elements like this:

# URI.open comes from open-uri; Kernel#open no longer fetches URLs on Ruby 3.0+.
page = Nokogiri::HTML(URI.open(url_path))
# xpath returns every matching node; at_xpath would return only the first one
page.xpath("//input[not (@type='hidden')]") # get all visible input fields


Related Topics



Leave a reply



Submit