import xml.etree.ElementTree as ELT

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
def parse_xml_to_csv(path, save_path=None):
    """
    Parse an XML posts dump into a DataFrame, optionally saving it as CSV.

    Every ``row`` element directly under the document root becomes one
    DataFrame row whose columns are that element's XML attributes. An extra
    ``body_text`` column holds the ``Body`` attribute with its HTML markup
    stripped (empty string when a row has no ``Body`` attribute).

    :param path: path to the xml document containing posts
    :param save_path: optional path of the CSV file to write; nothing is
        written when it is None or empty
    :return: a dataframe with one row per post
    """
    # Use python's standard library to parse xml file
    root = ELT.parse(path).getroot()

    # Each row is a post. Copy the attributes so that adding the body_text
    # key below does not mutate the parsed XML tree.
    all_rows = [dict(row.attrib) for row in root.findall('row')]

    # Using tqdm to display progress since preprocessing takes time
    for item in tqdm(all_rows):
        # Decode text from HTML. A row without a Body attribute yields an
        # empty string instead of aborting the whole conversion.
        soup = BeautifulSoup(item.get('Body', ''), features='html.parser')
        item['body_text'] = soup.get_text()

    # Create dataframe from our list of dict
    df = pd.DataFrame(all_rows)
    if save_path:
        df.to_csv(save_path)
    return df
# Convert the sample posts dump to CSV; the returned DataFrame is discarded.
parse_xml_to_csv(path="MiniPosts.xml", save_path="1.csv")
'''
MiniPosts.xml
<?xml version="1.0" encoding="utf-8"?> <posts> <row Id="5" PostTypeId="1" CreationDate="2014-05-13T23:58:30.457" Score="9" ViewCount="516" Body="<p>I've always been interested in machine learning, but I can't figure out one thing about starting out with a simple "Hello World" example - how can I avoid hard-coding behavior?</p>

<p>For example, if I wanted to "teach" a bot how to avoid randomly placed obstacles, I couldn't just use relative motion, because the obstacles move around, but I don't want to hard code, say, distance, because that ruins the whole point of machine learning.</p>

<p>Obviously, randomly generating code would be impractical, so how could I do this?</p>
" OwnerUserId="5" LastActivityDate="2014-05-14T00:36:31.077" Title="How can I do simple machine learning without hard-coding behavior?" Tags="<machine-learning>" AnswerCount="1" CommentCount="1" FavoriteCount="1" ClosedDate="2014-05-14T14:40:25.950" /> <row Id="7" PostTypeId="1" AcceptedAnswerId="10" CreationDate="2014-05-14T00:11:06.457" Score="4" ViewCount="411" Body="<p>As a researcher and instructor, I'm looking for open-source books (or similar materials) that provide a relatively thorough overview of data science from an applied perspective. To be clear, I'm especially interested in a thorough overview that provides material suitable for a college-level course, not particular pieces or papers.</p>
" OwnerUserId="36" LastEditorUserId="97" LastEditDate="2014-05-16T13:45:00.237" LastActivityDate="2014-05-16T13:45:00.237" Title="What open-source books (or other materials) provide a relatively thorough overview of data science?" Tags="<education><open-source>" A