Get Text from Web Page to String

Extract the text out of HTML string using JavaScript

Create an element, store the HTML in it, and get its textContent:





/**
 * Returns the plain text contained in an HTML string.
 *
 * The markup is assigned to the innerHTML of a freshly created <span> that is
 * never attached to the page, and the element's text is read back.
 * NOTE(review): the browser parses the markup, so this should presumably
 * only be fed HTML you trust — confirm for your use case.
 *
 * @param {string} s HTML markup to strip.
 * @returns {string} The text content of the parsed markup.
 */
function extractContent(s) {
  var holder = document.createElement('span');
  holder.innerHTML = s;

  // Fall back to innerText when textContent is empty or unavailable.
  var text = holder.textContent;
  return text ? text : holder.innerText;
}



// Demo: pass a small HTML snippet through extractContent and show the result in an alert box.
alert(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>"));

Copy web-page content to a string

Yes, there is a way to do this without using Internet Explorer: you can use a web request.

Here is a sample method. Basically, you are emulating the communication that would normally take place between a browser and server.

Option Explicit

' Fetches the resource at the given URL with an HTTP GET request and returns
' the response body as text.
'
' url:     URL to request.
' Returns: the .responseText of the completed request.
Public Function getPageText(url As String)
    With CreateObject("MSXML2.XMLHTTP")
        ' The third argument (async) defaults to True, in which case .send
        ' returns immediately and .responseText may be read before the
        ' response has arrived. Passing False makes .send block until done.
        .Open "GET", url, False
        .send
        getPageText = .responseText
    End With
End Function

' Demo: downloads a sample URL with getPageText and prints the response text
' via Debug.Print.
Sub Example()
    Dim targetUrl As String
    targetUrl = "http://re.jrc.ec.europa.eu/pvgis5/PVcalc.php?lat=45&lon=8&peakpower=1&loss=14&optimalangles=1&outputformat=basic"
    Debug.Print getPageText(targetUrl)
End Sub

Get text from web page to string

Use This:

/**
 * Activity that downloads a web page in a background task and displays the
 * result in a {@link TextView}.
 *
 * <p>The download is started by {@link #readWebpage(View)}.
 */
public class ReadWebpageAsyncTask extends Activity {
    private TextView textView;

    /** Called when the activity is first created. */
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);
        textView = (TextView) findViewById(R.id.TextView01);
    }

    /**
     * Background task that fetches every URL it is given and hands the
     * concatenated response bodies to {@link #onPostExecute(String)}.
     */
    private class DownloadWebPageTask extends AsyncTask<String, Void, String> {
        /**
         * Downloads each URL in turn.
         *
         * @param urls the URLs to fetch
         * @return the bodies of all successfully fetched URLs, concatenated;
         *         a URL that fails contributes whatever was read before the
         *         failure (possibly nothing)
         */
        @Override
        protected String doInBackground(String... urls) {
            // StringBuilder avoids re-copying the whole page on every line read.
            StringBuilder response = new StringBuilder();
            for (String url : urls) {
                DefaultHttpClient client = new DefaultHttpClient();
                HttpGet httpGet = new HttpGet(url);
                try {
                    HttpResponse execute = client.execute(httpGet);
                    InputStream content = execute.getEntity().getContent();

                    BufferedReader buffer = new BufferedReader(
                            new InputStreamReader(content));
                    try {
                        String line;
                        while ((line = buffer.readLine()) != null) {
                            // readLine() strips the line terminator; put one back so
                            // text on adjacent lines is not fused together.
                            response.append(line).append('\n');
                        }
                    } finally {
                        // Always release the stream, even if reading fails midway.
                        buffer.close();
                    }
                } catch (Exception e) {
                    // Best effort: a failing URL is logged and skipped.
                    e.printStackTrace();
                }
            }
            return response.toString();
        }

        /** Renders the downloaded HTML into the text view. */
        @Override
        protected void onPostExecute(String result) {
            textView.setText(Html.fromHtml(result));
        }
    }

    /**
     * Starts a background download of a fixed URL.
     *
     * @param view the view passed in by the caller; not used
     */
    public void readWebpage(View view) {
        DownloadWebPageTask task = new DownloadWebPageTask();
        task.execute("http://www.google.com");
    }
}

main.xml

<?xml version="1.0" encoding="utf-8"?>
<!-- Vertical layout: a "Load Webpage" button whose click calls readWebpage,
     followed by a TextView that initially shows "Example Text". -->
<LinearLayout
    xmlns:android="http://schemas.android.com/apk/res/android"
    android:layout_width="match_parent"
    android:layout_height="match_parent"
    android:orientation="vertical">

    <Button
        android:id="@+id/readWebpage"
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:onClick="readWebpage"
        android:text="Load Webpage" />

    <TextView
        android:id="@+id/TextView01"
        android:layout_width="match_parent"
        android:layout_height="match_parent"
        android:text="Example Text" />

</LinearLayout>

How to get text out of Website into String

You can use jsoup library for your purpose; a simple example to read paragraphs from the rendered HTML on the website below:

 try {
     // Fetch and parse the page, then print the text of every <p> element.
     Document page = Jsoup.connect("http://popofibo.com/pop/swaying-views-of-our-past/").get();
     for (Element paragraph : page.select("p")) {
         System.out.println(paragraph.text());
     }
 } catch (IOException ioe) {
     // The fetch failed: print the stack trace and carry on.
     ioe.printStackTrace();
 }

Output:

It is indeed difficult to argue over the mainstream ideas of evolution
of human civilizations...

EDIT:

To demo with your HTML as static content, you can simply select your div tag by its id:

/**
 * Parses a fixed HTML snippet and prints the text of each div whose id is
 * {@code editorText}.
 */
public static void main(String... args) {
    String html = "<html><body><div class=\"text\" id=\"editorText\" itemprop=\"text\">I want to get this Text</div></body></html>";
    Document parsed = Jsoup.parse(html);
    for (Element match : parsed.select("div#editorText")) {
        System.out.println(match.text());
    }
}

Output:

I want to get this Text

Get all text as a string from page

I have used .trim(); see this CodePen for reference: http://codepen.io/nagasai/pen/jrPpeK

    <html>
<head>
</head>
<body>
<!-- The heading lives in <body>: <h1> is not allowed inside <head>, and
     browsers move it into the body when parsing anyway. -->
<h1>wtf</h1>
<div>dddiiiv</div>
<a>aaaaa</a>
<p>ppp</p>
<div>div</div>
<!-- Empty placeholder elements. -->
<div id="impar"></div>
<div id="par"></div>
<div id="all"></div>

</body>
</html>

Javascript

    // Walk the page's visible text one character at a time, collecting all
    // characters, those at even indices and those at odd indices.
    // Note: despite its name, `elems` is a string (document.body.innerText).
    var elems = document.body.innerText;
    var odd = "";
    var even = "";
    var all = "";
    // A counted loop replaces `for (k in elems)`: that form created an implicit
    // global `k` and yielded string keys that were only coerced to numbers by `%`.
    for (let i = 0; i < elems.length; i++) {
      const ch = elems[i];
      all = all + ch;
      if (i % 2 === 0) {
        // trim() runs after every append, so leading/trailing whitespace
        // never accumulates in the result.
        even = (even + ch).trim();
      } else {
        odd = (odd + ch).trim();
      }
    }


    console.log(all);
    console.log(odd);
    console.log(even);

Hope this is helpful

Get text from Web page

You could convert the HTML page to a string and use regex / string operations to obtain the needed data.

try {
    // Read and trim the URL once, so the emptiness check and the request
    // use the same value (the request previously used the untrimmed text).
    String requestUrl = url_text.getText().toString().trim();
    if (requestUrl.isEmpty()) {
        Toast.makeText(getApplicationContext(), "URL String empty.", Toast.LENGTH_LONG).show();
    } else {
        textView.setText("");
        HttpClient client = new DefaultHttpClient();
        HttpGet request = new HttpGet(requestUrl);
        // Get the response body as a String via the response handler.
        ResponseHandler<String> responseHandler = new BasicResponseHandler();
        String response_str = client.execute(request, responseHandler);
        textView.setText(response_str);
    }
}
catch (Exception e) {
    // Any failure (bad URL, network error, ...) is reported in the text view.
    System.out.println("Some error occured.");
    textView.setText(e.getMessage());
}

Maybe the easiest way is to use the String.split function (note that its argument is a regular expression, so literal asterisks must be escaped):

String[] separated = response_str.split("**");
separated[0]; // part before the **
separated[1]; // your needed ip string
separated[2]; // part after the second **

How to extract text from html String


<script>

/**
 * Converts an HTML string into plain text in two passes through detached
 * DOM elements.
 *
 * @param {string} html Markup to strip.
 * @returns {string} The text of the markup, or "" when there is none.
 */
function stripHtml(html) {
  // Pass 1: run the input through a <textarea> and read it back via .value
  // (presumably to decode entity-encoded markup — verify against your input).
  var decoder = document.createElement("textarea");
  decoder.innerHTML = html;
  var decodedMarkup = decoder.value;

  // Pass 2: parse the result as real elements and read their combined text.
  var container = document.createElement("p");
  container.innerHTML = decodedMarkup;
  return container.textContent || container.innerText || "";
}




// Sample input: a product description wrapped in a div, with paragraphs and a bullet list.
var htmlString = `<div data-content-type="html" data-appearance="default " data-element="main"><p>The Angelina Tank Dress is simple yet sophisticated. This dress can be thrown over a swimsuit for last minute lunch plans or belted for dinner on the patio. The high-low hemline gives it the perfect amount of swing. </p><p>Features:</p><ul><li>Scoopneck</li><li>Sleeveless</li><li>Hits below the knee</li><li>Longer back hemline</li><li>Machine wash, tumble dry low</li></ul></div>`;

// Log the sample with its markup removed.
console.log(stripHtml(htmlString));
</script>

How to extract text from a WebPage

@Ethan, sure — I hope this is what you want. I simply call the readWebpage method from onCreate; I also modified it to drop the View parameter, since it was not being used.

    public class Searching_Animation_Screen extends ActionBarActivity {
TextView loading_txt;
Animation blink;
public String pre_split;
public String[] split_string;
TextView text;


@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_searchinganimationscreen);
ActionBar actionBar = getSupportActionBar();
actionBar.hide();
int width = getWindowManager().getDefaultDisplay().getWidth();
loading_txt = (TextView)findViewById(R.id.loading);
text =(TextView)findViewById(R.id.textView);
Typeface pacifico_typeface = Typeface.createFromAsset(getAssets(), "fonts/pacifico.ttf");
loading_txt.setTypeface(pacifico_typeface);
loading_txt.setTextSize(width / 20);
blink = AnimationUtils.loadAnimation(getApplicationContext(),
R.anim.blink);
loading_txt.setAnimation(blink);
Begin();

//* call webpage here,
//* note, i removed passing the view object since it is not being used
readWebpage()

}

//* (modify) by remvoving it from the code below
//* and removing the view object since it is not being used
public void readWebpage() {
DownloadWebPageTask task = new DownloadWebPageTask();
task.execute(new String[] {"http://www.google.com"});

}

private void Begin() {
Intent SEARCH_INTENT = getIntent();
pre_split=SEARCH_INTENT.getStringExtra("Search_Text");
split_string = pre_split.split(" ");
}


@Override
public boolean onCreateOptionsMenu(Menu menu) {
// Inflate the menu; this adds items to the action bar if it is present.
getMenuInflater().inflate(R.menu.menu_searchinganimationscreen, menu);
return true;
}

@Override
public boolean onOptionsItemSelected(MenuItem item) {
// Handle action bar item clicks here. The action bar will
// automatically handle clicks on the Home/Up button, so long
// as you specify a parent activity in AndroidManifest.xml.
int id = item.getItemId();

//noinspection SimplifiableIfStatement
if (id == R.id.action_settings) {
return true;
}

return super.onOptionsItemSelected(item);
}
private class DownloadWebPageTask extends AsyncTask<String, Void, String> {
String google_url ="https://www.google.com/#safe=active&q=";

@Override
protected String doInBackground(String... urls) {
String response = "";
for (String url : urls) {
DefaultHttpClient client = new DefaultHttpClient();
HttpGet httpGet = new HttpGet(url);
try {
HttpResponse execute = client.execute(httpGet);
InputStream content = execute.getEntity().getContent();

BufferedReader buffer = new BufferedReader(
new InputStreamReader(content));
String s = "";
while ((s = buffer.readLine()) != null) {
response += s;
}

} catch (Exception e) {
e.printStackTrace();
}
}
return response;
}

@Override
protected void onPostExecute(String result) {
text.setText(Html.fromHtml(result));
//throw into summarizer
}

}

}


Related Topics



Leave a reply



Submit